Skip to content

Commit 8c67a4c

Browse files
authored
Merge branch 'main' into pre-commit-ci-update-config
2 parents 27c365a + f0d250b commit 8c67a4c

File tree

10 files changed

+94
-25
lines changed

10 files changed

+94
-25
lines changed

.github/workflows/ci.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ jobs:
2222

2323
steps:
2424
- name: Checkout source
25-
uses: actions/checkout@v4
25+
uses: actions/checkout@v5
2626
with:
2727
submodules: recursive
2828
fetch-depth: 0 # required for version resolution
2929

3030
- name: Set up Conda
31-
uses: conda-incubator/setup-miniconda@v3.1.1
31+
uses: conda-incubator/setup-miniconda@v3.2.0
3232
with:
3333
channels: conda-forge
3434
miniforge-version: latest
@@ -78,7 +78,7 @@ jobs:
7878

7979
steps:
8080
- name: Checkout source
81-
uses: actions/checkout@v4
81+
uses: actions/checkout@v5
8282
with:
8383
submodules: recursive
8484
fetch-depth: 0 # required for version resolution

.github/workflows/codeql-analysis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232

3333
steps:
3434
- name: Checkout repository
35-
uses: actions/checkout@v4
35+
uses: actions/checkout@v5
3636

3737
# Initializes the CodeQL tools for scanning.
3838
- name: Initialize CodeQL

.github/workflows/wheel.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
# note: CIBW_ENVIRONMENT is now set in pyproject.toml
2929

3030
steps:
31-
- uses: actions/checkout@v4
31+
- uses: actions/checkout@v5
3232
with:
3333
submodules: true
3434
fetch-depth: 0 # required for version resolution for nightly wheels
@@ -44,7 +44,7 @@ jobs:
4444
name: Build source distribution
4545
runs-on: ubuntu-latest
4646
steps:
47-
- uses: actions/checkout@v4
47+
- uses: actions/checkout@v5
4848
with:
4949
submodules: true
5050
fetch-depth: 0

docs/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ Improvements
2727
By :user:`John Kirkham <jakirkham>`, :issue:`723`
2828
* All codecs are now pickleable.
2929
By :user:`Tom Nicholas <TomNicholas>`, :issue:`744`
30+
* The Zstandard codec can now decode bytes containing multiple frames.
31+
By :user:`Mark Kittisopikul <mkitti>`, :issue:`757`
3032

3133
Fixes
3234
~~~~~

numcodecs/pcodec.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Literal
22

33
from numcodecs.abc import Codec
4-
from numcodecs.compat import ensure_contiguous_ndarray
4+
from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
55
from pcodec import ChunkConfig, DeltaSpec, ModeSpec, PagingSpec, standalone
66

77
DEFAULT_MAX_PAGE_N = 262144
@@ -110,6 +110,7 @@ def encode(self, buf):
110110
return standalone.simple_compress(buf, config)
111111

112112
def decode(self, buf, out=None):
113+
buf = ensure_bytes(buf)
113114
if out is not None:
114115
out = ensure_contiguous_ndarray(out)
115116
standalone.simple_decompress_into(buf, out)

numcodecs/tests/common.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -150,16 +150,6 @@ def check_encode_decode_array(arr, codec):
150150
assert_array_items_equal(arr, dec)
151151

152152

153-
def check_encode_decode_array_to_bytes(arr, codec):
154-
enc = codec.encode(arr)
155-
dec = codec.decode(enc)
156-
assert_array_items_equal(arr, dec)
157-
158-
out = np.empty_like(arr)
159-
codec.decode(enc, out=out)
160-
assert_array_items_equal(arr, out)
161-
162-
163153
def check_config(codec):
164154
config = codec.get_config()
165155
# round-trip through JSON to check serialization

numcodecs/tests/test_pcodec.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from numcodecs.tests.common import (
1010
check_backwards_compatibility,
1111
check_config,
12-
check_encode_decode_array_to_bytes,
12+
check_encode_decode_array,
1313
check_err_decode_object_buffer,
1414
check_err_encode_object_buffer,
1515
check_repr,
@@ -49,7 +49,7 @@
4949
@pytest.mark.parametrize("arr", arrays)
5050
@pytest.mark.parametrize("codec", codecs)
5151
def test_encode_decode(arr, codec):
52-
check_encode_decode_array_to_bytes(arr, codec)
52+
check_encode_decode_array(arr, codec)
5353

5454

5555
def test_config():
@@ -61,13 +61,13 @@ def test_config():
6161
def test_invalid_config_error(param):
6262
codec = PCodec(**{param: "bogus"})
6363
with pytest.raises(ValueError):
64-
check_encode_decode_array_to_bytes(arrays[0], codec)
64+
check_encode_decode_array(arrays[0], codec)
6565

6666

6767
def test_invalid_delta_encoding_combo():
6868
codec = PCodec(delta_encoding_order=2, delta_spec="none")
6969
with pytest.raises(ValueError):
70-
check_encode_decode_array_to_bytes(arrays[0], codec)
70+
check_encode_decode_array(arrays[0], codec)
7171

7272

7373
def test_repr():

numcodecs/tests/test_pyzstd.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def test_pyzstd_simple(input):
2525
assert pyzstd.decompress(z.encode(input)) == input
2626

2727

28-
@pytest.mark.xfail
2928
@pytest.mark.parametrize("input", test_data)
3029
def test_pyzstd_simple_multiple_frames_decode(input):
3130
"""

numcodecs/tests/test_zstd.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,34 @@ def zstd_cli_available() -> bool:
156156
return not subprocess.run(
157157
["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
158158
).returncode
159+
160+
161+
def test_multi_frame():
    """Check that ``Zstd.decode`` handles buffers made of several concatenated frames."""
    zstd_codec = Zstd()

    # One encoded payload, decoded alone and then repeated back to back.
    greeting_frame = zstd_codec.encode(b"Hello world!")
    assert zstd_codec.decode(greeting_frame) == b"Hello world!"
    assert zstd_codec.decode(greeting_frame * 2) == b"Hello world!Hello world!"

    # Two different payloads, decoded separately and then joined.
    hola_frame = zstd_codec.encode(b"Hola ")
    mundo_frame = zstd_codec.encode(b"Mundo!")
    assert zstd_codec.decode(hola_frame) == b"Hola "
    assert zstd_codec.decode(mundo_frame) == b"Mundo!"
    assert zstd_codec.decode(hola_frame + mundo_frame) == b"Hola Mundo!"

    # Hand-written frame; treated below as the frame of unknown content size.
    streamed_frame = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!'
    streamed_plain = b'Hello World!'
    assert zstd_codec.decode(streamed_frame) == streamed_plain
    if zstd_cli_available():
        # Cross-check the literal against the zstd command line tool in both directions.
        assert streamed_frame == generate_zstd_streaming_bytes(streamed_plain)
        assert streamed_plain == generate_zstd_streaming_bytes(streamed_frame, decompress=True)

    # Concatenate frames of known sizes and unknown sizes
    # unknown size frame at the end
    tail_case = hola_frame + mundo_frame + streamed_frame
    assert zstd_codec.decode(tail_case) == b"Hola Mundo!Hello World!"
    # unknown size frame at the beginning
    head_case = streamed_frame + hola_frame + mundo_frame
    assert zstd_codec.decode(head_case) == b"Hello World!Hola Mundo!"
    # unknown size frame in the middle
    middle_case = hola_frame + streamed_frame + mundo_frame
    assert zstd_codec.decode(middle_case) == b"Hola Hello World!Mundo!"

numcodecs/zstd.pyx

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,12 @@ cdef extern from "zstd.h":
7171
size_t ZSTD_freeDStream(ZSTD_DStream* zds) nogil
7272
size_t ZSTD_initDStream(ZSTD_DStream* zds) nogil
7373

74-
cdef long ZSTD_CONTENTSIZE_UNKNOWN
75-
cdef long ZSTD_CONTENTSIZE_ERROR
74+
cdef unsigned long long ZSTD_CONTENTSIZE_UNKNOWN
75+
cdef unsigned long long ZSTD_CONTENTSIZE_ERROR
76+
7677
unsigned long long ZSTD_getFrameContentSize(const void* src,
7778
size_t srcSize) nogil
79+
size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) nogil
7880

7981
int ZSTD_minCLevel() nogil
8082
int ZSTD_maxCLevel() nogil
@@ -218,7 +220,11 @@ def decompress(source, dest=None):
218220

219221
try:
220222
# determine uncompressed size using unsigned long long for full range
221-
content_size = ZSTD_getFrameContentSize(source_ptr, source_size)
223+
try:
224+
content_size = findTotalContentSize(source_ptr, source_size)
225+
except RuntimeError:
226+
raise RuntimeError('Zstd decompression error: invalid input data')
227+
222228
if content_size == ZSTD_CONTENTSIZE_UNKNOWN and dest is None:
223229
return stream_decompress(source_pb)
224230
elif content_size == ZSTD_CONTENTSIZE_UNKNOWN:
@@ -362,6 +368,46 @@ cdef stream_decompress(const Py_buffer* source_pb):
362368

363369
return dest
364370

371+
cdef unsigned long long findTotalContentSize(const char* source_ptr, size_t source_size):
    """Find the total uncompressed content size of all frames in the source buffer

    The buffer is walked frame by frame: the compressed size of each frame
    gives the offset of the next one, and the content size declared by each
    frame is added to a running total.

    Parameters
    ----------
    source_ptr : Pointer to the beginning of the buffer
    source_size : Size of the buffer containing the frame sizes to sum

    Returns
    -------
    total_content_size: Sum of the content size of all frames within the source buffer
        If any of the frame sizes is unknown, return ZSTD_CONTENTSIZE_UNKNOWN.
        If any of the frames causes ZSTD_getFrameContentSize to error, or the
        sum of the frame sizes cannot be represented, return ZSTD_CONTENTSIZE_ERROR.
        An empty buffer (source_size == 0) yields 0.

    Raises
    ------
    RuntimeError
        If ZSTD_findFrameCompressedSize reports an error for any frame, so the
        frame boundaries cannot be determined.
    """
    cdef:
        unsigned long long frame_content_size = 0
        unsigned long long total_content_size = 0
        size_t frame_compressed_size = 0
        size_t offset = 0

    while offset < source_size:
        # Compressed length of the frame starting at `offset`; this is what
        # lets us step to the start of the next frame.
        frame_compressed_size = ZSTD_findFrameCompressedSize(source_ptr + offset, source_size - offset)

        if ZSTD_isError(frame_compressed_size):
            error = ZSTD_getErrorName(frame_compressed_size)
            raise RuntimeError('Could not determine zstd frame size: %s' % error)

        # Only this frame's bytes are handed over, so the header lookup cannot
        # read into the following frame.
        frame_content_size = ZSTD_getFrameContentSize(source_ptr + offset, frame_compressed_size)

        if frame_content_size == ZSTD_CONTENTSIZE_ERROR:
            return ZSTD_CONTENTSIZE_ERROR

        if frame_content_size == ZSTD_CONTENTSIZE_UNKNOWN:
            return ZSTD_CONTENTSIZE_UNKNOWN

        # Unsigned addition wraps silently. zstd.h defines the two sentinels as
        # the top two unsigned 64-bit values, so any sum reaching
        # ZSTD_CONTENTSIZE_ERROR would either collide with a sentinel or wrap
        # to a bogus small size. Report such input as an error instead.
        if frame_content_size >= ZSTD_CONTENTSIZE_ERROR - total_content_size:
            return ZSTD_CONTENTSIZE_ERROR

        total_content_size += frame_content_size
        offset += frame_compressed_size

    return total_content_size
410+
365411
class Zstd(Codec):
366412
"""Codec providing compression using Zstandard.
367413

0 commit comments

Comments
 (0)