Skip to content
Open
202 changes: 187 additions & 15 deletions src/bmaptool/BmapCopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,16 +213,22 @@ class BmapCopy(object):
instance.
"""

def __init__(self, image, dest, bmap=None, image_size=None):
def __init__(self, image, dest, bmap=None, image_size=None, checksum_retry=None):
"""
The class constructor. The parameters are:
image - file-like object of the image which should be copied,
should only support 'read()' and 'seek()' methods,
and only seeking forward has to be supported.
dest - file object of the destination file to copy the image
to.
bmap - file object of the bmap file to use for copying.
image_size - size of the image in bytes.
image - file-like object of the image which should be copied,
should only support 'read()' and 'seek()' methods,
and only seeking forward has to be supported.
dest - file object of the destination file to copy the image
to.
bmap - file object of the bmap file to use for copying.
image_size - size of the image in bytes.
checksum_retry - number of retries for verifying written bmap ranges.
If set to a value > 0, each written mapped range
described in the bmap will be read back and its checksum
verified against the expected value from the bmap file.
If a mismatch is detected for a range, the entire range
will be rewritten up to checksum_retry times.
"""

self._xml = None
Expand Down Expand Up @@ -270,6 +276,10 @@ def __init__(self, image, dest, bmap=None, image_size=None):
self._cs_attrib_name = None
self._bmap_cs_attrib_name = None

# Checksum retry configuration for post-write verification
self._checksum_retry = checksum_retry
self._warned_missing_checksum = False

# Special quirk for /dev/null which does not support fsync()
if (
stat.S_ISCHR(st_data.st_mode)
Expand Down Expand Up @@ -615,10 +625,12 @@ def _get_batches(self, first, last):
def _get_data(self, verify):
"""
This is generator which reads the image file in '_batch_blocks' chunks
and yields ('type', 'start', 'end', 'buf) tuples, where:
and yields ('type', 'start', 'end', 'buf', 'range_first', 'range_last', 'range_chksum')
tuples, where:
* 'start' is the starting block number of the batch;
* 'end' is the last block of the batch;
* 'buf' a buffer containing the batch data.
* 'buf' a buffer containing the batch data;
* 'range_first', 'range_last', 'range_chksum' are the bmap range info for post-write verification.
"""

_log.debug("the reader thread has started")
Expand Down Expand Up @@ -655,7 +667,7 @@ def _get_data(self, verify):
% (blocks, self._batch_queue.qsize())
)

self._batch_queue.put(("range", start, start + blocks - 1, buf))
self._batch_queue.put(("range", start, start + blocks - 1, buf, first, last, chksum))

if verify and chksum and hash_obj.hexdigest() != chksum:
raise Error(
Expand All @@ -673,6 +685,133 @@ def _get_data(self, verify):

self._batch_queue.put(None)

def _verify_written_blocks(self, start, end, expected_chksum):
    """
    Verify the checksum of blocks that were written to the destination file.

    Args:
        start - starting block number
        end - ending block number
        expected_chksum - expected checksum string from the bmap file

    Returns:
        True if the checksum matches (or there is nothing to verify),
        False on mismatch, short read, or read error.
    """
    # Nothing to verify when no checksum type or expected value exists.
    if not self._cs_type or not expected_chksum:
        return True

    length = (end - start + 1) * self.block_size
    # The last block of the image may be partial: the bmap checksum is
    # computed over the actual image bytes only, so clamp the read length
    # to the image size to avoid hashing bytes beyond the image end
    # (which would make the final range fail verification forever).
    if self.image_size:
        length = min(length, self.image_size - start * self.block_size)

    try:
        self._f_dest.seek(start * self.block_size)
        buf = self._f_dest.read(length)

        if not buf or len(buf) < length:
            # Short read: the destination does not hold the data we
            # expect; report a mismatch so the range gets rewritten.
            return False

        hash_obj = hashlib.new(self._cs_type)
        hash_obj.update(buf)

        # 'hexdigest()' is lower-case; normalize the expected value so a
        # bmap file with upper-case hex digits still verifies correctly.
        return hash_obj.hexdigest() == expected_chksum.lower()
    except IOError as err:
        _log.error(
            "error while verifying blocks %d-%d of '%s': %s",
            start, end, self._dest_path, err
        )
        return False

def _drop_cached_blocks(self, start, end):
    """
    Best-effort invalidation of destination page cache for block range.
    This helps make checksum verification read data back from storage
    rather than returning cached pages.
    """

    # 'posix_fadvise' and the DONTNEED advice are not available on every
    # platform; silently skip the cache invalidation when missing.
    fadvise = getattr(os, "posix_fadvise", None)
    advice = getattr(os, "POSIX_FADV_DONTNEED", None)
    if fadvise is None or advice is None:
        return

    first_byte = start * self.block_size
    byte_count = (end - start + 1) * self.block_size

    try:
        fadvise(self._f_dest.fileno(), first_byte, byte_count, advice)
    except OSError as err:
        # Failure here is harmless - verification may simply hit the cache.
        _log.debug(
            "cannot drop page cache for blocks %d-%d of '%s': %s",
            start,
            end,
            self._dest_path,
            err,
        )

def _verify_range_with_retry(self, range_first, range_last, range_chksum, range_buffers):
    """
    Verify a block range's checksum after writing. If verification fails,
    retry writing the range up to self._checksum_retry times.

    Args:
        range_first - first block of the range
        range_last - last block of the range
        range_chksum - expected checksum for the range (may be falsy when
                       no checksum information is available)
        range_buffers - dict of {(start, end): buf} for blocks in this range

    Raises:
        Error - if the range still fails verification after all retries,
                or if rewriting the range fails.
    """
    # Warn once when verification was requested but there is no checksum
    # to verify against - this covers both "no bmap file at all" and
    # "bmap file without per-range checksums".
    if self._checksum_retry and not range_chksum and not self._warned_missing_checksum:
        _log.warning(
            "checksum-retry requested but no checksum information is available "
            "(no bmap file or bmap without checksums); skipping verification "
            "for blocks %d-%d and beyond"
            % (range_first, range_last)
        )
        self._warned_missing_checksum = True

    if not range_chksum or not self._checksum_retry:
        return

    retry_count = 0
    retry_limit = int(self._checksum_retry)

    while True:
        # Sync to disk before reading back for verification
        # This ensures we verify data that has actually reached the physical
        # disk, not just what's in the kernel's page cache
        self.sync()
        self._drop_cached_blocks(range_first, range_last)

        # Verify the entire range
        if self._verify_written_blocks(range_first, range_last, range_chksum):
            _log.debug(
                "checksum verification passed for blocks %d-%d"
                % (range_first, range_last)
            )
            return

        # Checksum mismatch - retry
        retry_count += 1
        if retry_count > retry_limit:
            raise Error(
                "checksum verification failed for blocks %d-%d after "
                "%d retries" % (range_first, range_last, retry_limit)
            )

        _log.warning(
            "checksum mismatch for blocks %d-%d, retrying "
            "(attempt %d/%d)" % (range_first, range_last, retry_count, retry_limit)
        )

        # Re-write all blocks in this range
        for (start, end), buf in range_buffers.items():
            try:
                self._f_dest.seek(start * self.block_size)
                self._f_dest.write(buf)
            except IOError as err:
                raise Error(
                    "error while writing blocks %d-%d of '%s': %s"
                    % (start, end, self._dest_path, err)
                )

def copy(self, sync=True, verify=True):
"""
Copy the image to the destination file using bmap. The 'sync' argument
Expand Down Expand Up @@ -704,22 +843,46 @@ def copy(self, sync=True, verify=True):

# Read the image in '_batch_blocks' chunks and write them to the
# destination file
range_buffers = {} # Track buffers for each range to enable retry
current_range = None

while True:
batch = self._batch_queue.get()
if batch is None:
# No more data, the image is written
# Verify any remaining range
if self._checksum_retry and current_range:
range_first, range_last, range_chksum = current_range
if range_chksum and self._cs_type:
self._verify_range_with_retry(
range_first, range_last, range_chksum, range_buffers
)
break
elif batch[0] == "error":
# The reader thread encountered an error and passed us the
# exception.
exc_info = batch[1]
raise exc_info[1]

(start, end, buf) = batch[1:4]
(start, end, buf, range_first, range_last, range_chksum) = batch[1:7]

assert len(buf) <= (end - start + 1) * self.block_size
assert len(buf) > (end - start) * self.block_size

# Check if we've moved to a new range - if so, verify the previous one
new_range = (range_first, range_last, range_chksum)
if self._checksum_retry and current_range and new_range != current_range:
# Verify the completed range before moving to the next one
prev_first, prev_last, prev_chksum = current_range
if prev_chksum and self._cs_type:
self._verify_range_with_retry(
prev_first, prev_last, prev_chksum, range_buffers
)
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning path for --checksum-retry when the bmap has no per-range checksums never triggers because callers only invoke _verify_range_with_retry() when range_chksum is truthy. As a result, --checksum-retry becomes a silent no-op for checksum-less bmaps (despite the CLI log message). Call _verify_range_with_retry() unconditionally when finishing a range (let it decide whether to warn/skip), or move the missing-checksum detection outside the if prev_chksum ... guard.

Copilot uses AI. Check for mistakes.
# Reset for new range
range_buffers = {}

current_range = new_range

self._f_dest.seek(start * self.block_size)

# Synchronize the destination file if we reached the watermark
Expand All @@ -740,6 +903,10 @@ def copy(self, sync=True, verify=True):
blocks_written += end - start + 1
bytes_written += len(buf)

# Track buffers for this range (for retry if needed)
if self._checksum_retry:
range_buffers[(start, end)] = buf

self._update_progress(blocks_written)

if not self.image_size:
Expand Down Expand Up @@ -785,7 +952,12 @@ def sync(self):

if self._dest_supports_fsync:
try:
os.fsync(self._f_dest.fileno()),
self._f_dest.flush()
except IOError as err:
raise Error("cannot flush '%s': %s" % (self._dest_path, err))

try:
os.fsync(self._f_dest.fileno())
except OSError as err:
raise Error(
"cannot synchronize '%s': %s " % (self._dest_path, err.strerror)
Expand All @@ -800,14 +972,14 @@ class BmapBdevCopy(BmapCopy):
scheduler.
"""

def __init__(self, image, dest, bmap=None, image_size=None):
def __init__(self, image, dest, bmap=None, image_size=None, checksum_retry=None):
"""
The same as the constructor of the 'BmapCopy' base class, but adds
useful guard-checks specific to block devices.
"""

# Call the base class constructor first
BmapCopy.__init__(self, image, dest, bmap, image_size)
BmapCopy.__init__(self, image, dest, bmap, image_size, checksum_retry)

self._dest_fsync_watermark = (6 * 1024 * 1024) // self.block_size

Expand Down
55 changes: 49 additions & 6 deletions src/bmaptool/CLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def __getattr__(self, name):
return getattr(self._file_obj, name)


def open_block_device(path):
def open_block_device(path, need_read_access=False):
"""
This is a helper function for 'open_files()' which is called if the
destination file of the "copy" command is a block device. We handle block
Expand All @@ -115,17 +115,25 @@ def open_block_device(path):
that we are the only users of the block device.

This function opens a block device specified by 'path' in exclusive mode.
The 'need_read_access' parameter controls whether the device is opened
with read+write (True) or write-only (False) permissions.
Returns opened file object.
"""

try:
descriptor = os.open(path, os.O_WRONLY | os.O_EXCL)
if need_read_access:
descriptor = os.open(path, os.O_RDWR | os.O_EXCL)
else:
descriptor = os.open(path, os.O_WRONLY | os.O_EXCL)
except OSError as err:
error_out("cannot open block device '%s' in exclusive mode: %s", path, err)

# Turn the block device file descriptor into a file object
try:
file_obj = os.fdopen(descriptor, "wb")
if need_read_access:
file_obj = os.fdopen(descriptor, "r+b")
else:
file_obj = os.fdopen(descriptor, "wb")
except OSError as err:
os.close(descriptor)
error_out("cannot open block device '%s':\n%s", path, err)
Expand Down Expand Up @@ -586,7 +594,9 @@ def open_files(args):
try:
if pathlib.Path(args.dest).is_block_device():
dest_is_blkdev = True
dest_obj = open_block_device(args.dest)
# Request read access only if checksum-retry is enabled
need_read = bool(args.checksum_retry)
dest_obj = open_block_device(args.dest, need_read_access=need_read)
else:
dest_obj = open(args.dest, "wb+")
except IOError as err:
Expand All @@ -610,6 +620,22 @@ def copy_command(args):
if args.no_sig_verify and args.fingerprint:
error_out("--no-sig-verify and --fingerprint cannot be used together")

# Validate checksum_retry argument
checksum_retry = None
if args.checksum_retry is not None:
try:
checksum_retry = int(args.checksum_retry)
if checksum_retry < 1:
error_out("--checksum-retry argument must be a positive integer")
except ValueError:
error_out("--checksum-retry argument must be a valid integer")

Comment on lines +623 to +632
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

--checksum-retry relies on bmap range checksums; when --nobmap is used the option becomes a no-op (and block devices may still be opened read-write). Consider rejecting --checksum-retry together with --nobmap (and/or when no bmap file is found) with a clear error message.

Copilot uses AI. Check for mistakes.
if checksum_retry:
log.info(
"checksum verification of written blocks enabled with up to %d retry attempts"
% checksum_retry
)

image_obj, dest_obj, bmap_obj, bmap_path, image_size, dest_is_blkdev = open_files(
args
)
Expand All @@ -631,10 +657,14 @@ def copy_command(args):
if dest_is_blkdev:
dest_str = "block device '%s'" % args.dest
# For block devices, use the specialized class
writer = BmapCopy.BmapBdevCopy(image_obj, dest_obj, bmap_obj, image_size)
writer = BmapCopy.BmapBdevCopy(
image_obj, dest_obj, bmap_obj, image_size, checksum_retry
)
else:
dest_str = "file '%s'" % os.path.basename(args.dest)
writer = BmapCopy.BmapCopy(image_obj, dest_obj, bmap_obj, image_size)
writer = BmapCopy.BmapCopy(
image_obj, dest_obj, bmap_obj, image_size, checksum_retry
)
except BmapCopy.Error as err:
error_out(err)

Expand Down Expand Up @@ -855,6 +885,19 @@ def parse_arguments():
text = "do not verify the data checksum while writing"
parser_copy.add_argument("--no-verify", action="store_true", help=text)

# The --checksum-retry option
text = (
"verify checksums of written blocks and retry writing on mismatch "
"(optional: number of retries, default 1)"
)
parser_copy.add_argument(
"--checksum-retry",
nargs="?",
const="1",
type=str,
help=text,
)

# The --psplash-pipe option
text = "write progress to a psplash pipe"
parser_copy.add_argument("--psplash-pipe", help=text)
Expand Down
Loading
Loading