HADOOP-19604. ABFS: BlockId generation based on blockCount along with full blob md5 computation change #7777

Merged · 22 commits · Jul 23, 2025
@@ -48,6 +48,8 @@
files="org[\\/]apache[\\/]hadoop[\\/]fs[\\/]azurebfs[\\/]services[\\/]AbfsClient.java"/>
<suppress checks="ParameterNumber"
files="org[\\/]apache[\\/]hadoop[\\/]fs[\\/]azurebfs[\\/]services[\\/]AbfsBlobClient.java"/>
<suppress checks="ParameterNumber"
files="org[\\/]apache[\\/]hadoop[\\/]fs[\\/]azurebfs[\\/]services[\\/]AbfsDfsClient.java"/>
<suppress checks="ParameterNumber|MagicNumber"
files="org[\\/]apache[\\/]hadoop[\\/]fs[\\/]azurebfs[\\/]services[\\/]VersionedFileStatus.java"/>
<suppress checks="ParameterNumber"
@@ -216,6 +216,7 @@ public static ApiVersion getCurrentVersion() {
public static final String XML_TAG_RESOURCE_TYPE = "ResourceType";
public static final String XML_TAG_INVALID_XML = "Invalid XML";
public static final String XML_TAG_HDI_ISFOLDER = "hdi_isfolder";
public static final String XML_TAG_HDI_PERMISSION = "hdi_permission";
public static final String XML_TAG_ETAG = "Etag";
public static final String XML_TAG_LAST_MODIFIED_TIME = "Last-Modified";
public static final String XML_TAG_CREATION_TIME = "Creation-Time";
@@ -150,6 +150,36 @@ public final class FileSystemConfigurations {
*/
public static final int BLOCK_ID_LENGTH = 60;

/**
* Format string for generating block IDs.
* Example: "%s-%06d" where %s is the stream ID and %06d is the block index.
*/
public static final String BLOCK_ID_FORMAT = "%s-%06d";

/**
* Format string for padding block IDs.
* Example: "%-" specifies left alignment in the format string.
*/
public static final String PADDING_FORMAT = "%-";

/**
* Suffix for string formatting.
* Example: "s" specifies the type as a string in the format string.
*/
public static final String STRING_SUFFIX = "s";

/**
* Character used for padding spaces in block IDs.
* Example: ' ' represents a space character.
*/
public static final char SPACE_CHARACTER = ' ';

/**
* Character used for padding block IDs.
* Example: '_' is used to replace spaces in padded block IDs.
*/
public static final char PADDING_CHARACTER = '_';
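Taken together, these constants assemble a `%-<rawLength>s` padding format at runtime. A minimal sketch with the constant values inlined (60 stands in for the configured raw block ID length; this snippet is illustrative only, not part of the patch):

```java
public class PaddingSketch {
  public static void main(String[] args) {
    String fmt = "%-" + 60 + "s";              // PADDING_FORMAT + rawLength + STRING_SUFFIX -> "%-60s"
    String padded = String.format(fmt, "abc")  // "abc" plus 57 trailing spaces
        .replace(' ', '_');                    // SPACE_CHARACTER -> PADDING_CHARACTER
    System.out.println(padded.length());       // 60
  }
}
```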

/**
* Buffer blocks to disk.
* Capacity is limited to available disk space.
@@ -37,6 +37,7 @@ public enum Mode {
private boolean isExpectHeaderEnabled;
private boolean isRetryDueToExpect;
private BlobAppendRequestParameters blobParams;
private final String md5;


/**
@@ -48,14 +49,16 @@ public enum Mode {
* @param isAppendBlob true if the blob is append-blob
* @param leaseId leaseId of the blob to be appended
* @param isExpectHeaderEnabled true if the expect header is enabled
* @param md5 The Base64-encoded MD5 hash of the block for data integrity validation.
*/
public AppendRequestParameters(final long position,
final int offset,
final int length,
final Mode mode,
final boolean isAppendBlob,
final String leaseId,
final boolean isExpectHeaderEnabled) {
final boolean isExpectHeaderEnabled,
final String md5) {
this.position = position;
this.offset = offset;
this.length = length;
@@ -65,6 +68,7 @@ public AppendRequestParameters(final long position,
this.isExpectHeaderEnabled = isExpectHeaderEnabled;
this.isRetryDueToExpect = false;
this.blobParams = null;
this.md5 = md5;
}

/**
@@ -77,6 +81,7 @@ public AppendRequestParameters(final long position,
* @param leaseId leaseId of the blob to be appended
* @param isExpectHeaderEnabled true if the expect header is enabled
* @param blobParams parameters specific to append operation on Blob Endpoint.
* @param md5 The Base64-encoded MD5 hash of the block for data integrity validation.
*/
public AppendRequestParameters(final long position,
final int offset,
@@ -85,7 +90,8 @@ public AppendRequestParameters(final long position,
final boolean isAppendBlob,
final String leaseId,
final boolean isExpectHeaderEnabled,
final BlobAppendRequestParameters blobParams) {
final BlobAppendRequestParameters blobParams,
final String md5) {
this.position = position;
this.offset = offset;
this.length = length;
@@ -95,6 +101,7 @@ public AppendRequestParameters(final long position,
this.isExpectHeaderEnabled = isExpectHeaderEnabled;
this.isRetryDueToExpect = false;
this.blobParams = blobParams;
this.md5 = md5;
}

public long getPosition() {
@@ -146,6 +153,15 @@ public String getBlockId() {
return getBlobParams().getBlockId();
}

/**
* Gets the MD5 hash.
*
* @return the MD5 hash string
*/
public String getMd5() {
return md5;
}
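As a hedged illustration of how the new md5 argument might be fed by a caller (a sketch only: `Mode.APPEND_MODE` is an assumed enum constant, and the boolean/lease arguments are placeholders, not values taken from this patch):

```java
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

// Hypothetical caller-side sketch: compute the Base64-encoded MD5 of one
// block and pass it through the new md5 parameter.
class AppendParamsSketch {
  static AppendRequestParameters build(long position, byte[] buffer,
      int offset, int length) throws NoSuchAlgorithmException {
    MessageDigest digest = MessageDigest.getInstance("MD5");
    digest.update(buffer, offset, length);  // per-block digest
    String blockMd5 = Base64.getEncoder().encodeToString(digest.digest());
    return new AppendRequestParameters(
        position, offset, length,
        AppendRequestParameters.Mode.APPEND_MODE,  // assumed constant
        false,      // isAppendBlob
        null,       // leaseId
        true,       // isExpectHeaderEnabled
        blockMd5);  // new md5 argument
  }
}
```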

/**
* Sets whether the retry is due to the Expect header.
*
@@ -20,10 +20,15 @@

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.UUID;

import org.apache.commons.codec.binary.Base64;

import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.BLOCK_ID_LENGTH;
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.BLOCK_ID_FORMAT;
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.PADDING_CHARACTER;
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.PADDING_FORMAT;
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.SPACE_CHARACTER;
import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.STRING_SUFFIX;

/**
* Represents a block in Azure Blob Storage used by Azure Data Lake Storage (ADLS).
@@ -34,31 +39,50 @@
public class AbfsBlobBlock extends AbfsBlock {

private final String blockId;
private final long blockIndex;

/**
* Gets the activeBlock and the blockId.
*
* @param outputStream AbfsOutputStream Instance.
* @param offset Used to generate blockId based on offset.
Review comment (Contributor): Add newly added parameter in method comment.
Author reply: taken

* @param blockIdLength the expected length of the generated block ID.
* @param blockIndex the index of the block; used in block ID generation.
* @throws IOException if an I/O error occurs.
*/
AbfsBlobBlock(AbfsOutputStream outputStream, long offset) throws IOException {
AbfsBlobBlock(AbfsOutputStream outputStream, long offset, int blockIdLength, long blockIndex) throws IOException {
super(outputStream, offset);
this.blockId = generateBlockId(offset);
this.blockIndex = blockIndex;
String streamId = outputStream.getStreamID();
UUID streamIdGuid = UUID.nameUUIDFromBytes(streamId.getBytes(StandardCharsets.UTF_8));
Review comment (Contributor): Can streamId be null? streamId.getBytes can raise a NullPointerException. Better to handle it.
Author reply: streamId can never be null, as it is set in the constructor of AbfsOutputStream itself: this.outputStreamId = createOutputStreamId();

this.blockId = generateBlockId(streamIdGuid, blockIdLength);
}

/**
* Helper method that generates blockId.
* @param position The offset needed to generate blockId.
* @return String representing the block ID generated.
* Generates a Base64-encoded block ID string using the given stream UUID and block index.
* The block ID is first created as a raw string using a format with the stream ID and block index.
* If a non-zero rawLength is provided, the raw block ID is padded or trimmed to match the length.
* The final string is then Base64-encoded and returned.
*
* @param streamId the UUID of the stream used to generate the block ID.
* @param rawLength the desired length of the raw block ID string before encoding.
* If 0, no length adjustment is done.
* @return the Base64-encoded block ID string.
*/
private String generateBlockId(long position) {
String streamId = getOutputStream().getStreamID();
String streamIdHash = Integer.toString(streamId.hashCode());
String blockId = String.format("%d_%s", position, streamIdHash);
byte[] blockIdByteArray = new byte[BLOCK_ID_LENGTH];
System.arraycopy(blockId.getBytes(StandardCharsets.UTF_8), 0, blockIdByteArray, 0, Math.min(BLOCK_ID_LENGTH, blockId.length()));
return new String(Base64.encodeBase64(blockIdByteArray), StandardCharsets.UTF_8);
private String generateBlockId(UUID streamId, int rawLength) {
String rawBlockId = String.format(BLOCK_ID_FORMAT, streamId, blockIndex);

if (rawLength != 0) {
// Adjust to match expected decoded length
if (rawBlockId.length() < rawLength) {
rawBlockId = String.format(PADDING_FORMAT + rawLength + STRING_SUFFIX, rawBlockId)
.replace(SPACE_CHARACTER, PADDING_CHARACTER);
} else if (rawBlockId.length() > rawLength) {
rawBlockId = rawBlockId.substring(0, rawLength);
}
}

return Base64.encodeBase64String(rawBlockId.getBytes(StandardCharsets.UTF_8));
}
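For intuition, a standalone sketch of the scheme above (java.util.Base64 substituted for commons-codec so the snippet is self-contained; the format string mirrors BLOCK_ID_FORMAT = "%s-%06d", and 60 stands in for the configured raw length):

```java
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.UUID;

public class BlockIdSketch {
  public static void main(String[] args) {
    UUID streamIdGuid = UUID.nameUUIDFromBytes(
        "example-stream-id".getBytes(StandardCharsets.UTF_8));
    long blockIndex = 7;
    int rawLength = 60;

    // "<36-char uuid>-000007" is 43 chars, so it is right-padded with '_' to 60.
    String raw = String.format("%s-%06d", streamIdGuid, blockIndex);
    if (raw.length() < rawLength) {
      raw = String.format("%-" + rawLength + "s", raw).replace(' ', '_');
    } else if (raw.length() > rawLength) {
      raw = raw.substring(0, rawLength);
    }
    // 60 input bytes -> fixed 80-char Base64 block ID, identical across
    // retries of the same (stream, blockIndex) pair.
    System.out.println(Base64.getEncoder()
        .encodeToString(raw.getBytes(StandardCharsets.UTF_8)));
  }
}
```

The key property of the new scheme: the ID depends only on the stream identity and the block's ordinal, so a retried upload of the same block reuses the same ID rather than minting a new offset-derived one.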

/**
@@ -129,6 +129,7 @@
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_TAG_BLOCK_NAME;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_TAG_COMMITTED_BLOCKS;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_TAG_HDI_ISFOLDER;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_TAG_HDI_PERMISSION;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_TAG_NAME;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XML_VERSION;
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.XMS_PROPERTIES_ENCODING_ASCII;
@@ -898,7 +899,7 @@ public AbfsRestOperation append(final String path,
requestHeaders.add(new AbfsHttpHeader(EXPECT, HUNDRED_CONTINUE));
}
if (isChecksumValidationEnabled()) {
addCheckSumHeaderForWrite(requestHeaders, reqParams, buffer);
addCheckSumHeaderForWrite(requestHeaders, reqParams);
}
if (reqParams.isRetryDueToExpect()) {
String userAgentRetry = getUserAgent();
@@ -982,6 +983,9 @@ public AbfsRestOperation appendBlock(final String path,
if (requestParameters.getLeaseId() != null) {
requestHeaders.add(new AbfsHttpHeader(X_MS_LEASE_ID, requestParameters.getLeaseId()));
}
if (isChecksumValidationEnabled()) {
addCheckSumHeaderForWrite(requestHeaders, requestParameters);
}
final AbfsUriQueryBuilder abfsUriQueryBuilder = createDefaultUriQueryBuilder();
abfsUriQueryBuilder.addQuery(QUERY_PARAM_COMP, APPEND_BLOCK);
String sasTokenForReuse = appendSASTokenToQuery(path, SASTokenProvider.WRITE_OPERATION, abfsUriQueryBuilder);
@@ -1021,6 +1025,7 @@ public AbfsRestOperation appendBlock(final String path,
* @param leaseId if there is an active lease on the path.
* @param contextEncryptionAdapter to provide encryption context.
* @param tracingContext for tracing the server calls.
* @param blobMd5 the MD5 hash of the blob for integrity verification.
* @return exception as this operation is not supported on Blob Endpoint.
* @throws UnsupportedOperationException always.
*/
@@ -1032,7 +1037,7 @@ public AbfsRestOperation flush(final String path,
final String cachedSasToken,
final String leaseId,
final ContextEncryptionAdapter contextEncryptionAdapter,
final TracingContext tracingContext) throws AzureBlobFileSystemException {
final TracingContext tracingContext, String blobMd5) throws AzureBlobFileSystemException {
Review comment (Contributor): Add new argument in the comments @param. Please make this change wherever required.
Author reply: taken

throw new UnsupportedOperationException(
"Flush without blockIds not supported on Blob Endpoint");
}
@@ -1049,6 +1054,7 @@ public AbfsRestOperation flush(final String path,
* @param eTag The etag of the blob.
* @param contextEncryptionAdapter to provide encryption context.
* @param tracingContext for tracing the service call.
* @param blobMd5 the MD5 hash of the blob for integrity verification.
* @return executed rest operation containing response from server.
* @throws AzureBlobFileSystemException if rest operation fails.
*/
@@ -1060,7 +1066,7 @@ public AbfsRestOperation flush(byte[] buffer,
final String leaseId,
final String eTag,
ContextEncryptionAdapter contextEncryptionAdapter,
final TracingContext tracingContext) throws AzureBlobFileSystemException {
final TracingContext tracingContext, String blobMd5) throws AzureBlobFileSystemException {
Review comment (Contributor): Same as above.
Author reply: taken

final List<AbfsHttpHeader> requestHeaders = createDefaultHeaders();
addEncryptionKeyRequestHeaders(path, requestHeaders, false,
contextEncryptionAdapter, tracingContext);
@@ -1070,9 +1076,9 @@ public AbfsRestOperation flush(byte[] buffer,
if (leaseId != null) {
requestHeaders.add(new AbfsHttpHeader(X_MS_LEASE_ID, leaseId));
}
String md5Hash = computeMD5Hash(buffer, 0, buffer.length);
requestHeaders.add(new AbfsHttpHeader(X_MS_BLOB_CONTENT_MD5, md5Hash));

if (blobMd5 != null) {
requestHeaders.add(new AbfsHttpHeader(X_MS_BLOB_CONTENT_MD5, blobMd5));
}
final AbfsUriQueryBuilder abfsUriQueryBuilder = createDefaultUriQueryBuilder();
abfsUriQueryBuilder.addQuery(QUERY_PARAM_COMP, BLOCKLIST);
abfsUriQueryBuilder.addQuery(QUERY_PARAM_CLOSE, String.valueOf(isClose));
@@ -1097,7 +1103,7 @@ public AbfsRestOperation flush(byte[] buffer,
AbfsRestOperation op1 = getPathStatus(path, true, tracingContext,
contextEncryptionAdapter);
String metadataMd5 = op1.getResult().getResponseHeader(CONTENT_MD5);
if (!md5Hash.equals(metadataMd5)) {
if (blobMd5 != null && !blobMd5.equals(metadataMd5)) {
throw ex;
}
return op;
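The blobMd5 sent here is the digest of the whole blob, not of a single block. A minimal sketch of the accumulation idea (a hypothetical helper, not the actual AbfsOutputStream code):

```java
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

// Hypothetical tracker: feed every appended block, in commit order, into one
// MessageDigest; Base64-encode the final digest when the blob is flushed.
public class FullBlobMd5Tracker {
  private final MessageDigest fullBlobDigest;

  public FullBlobMd5Tracker() throws NoSuchAlgorithmException {
    fullBlobDigest = MessageDigest.getInstance("MD5");
  }

  public void onBlockWritten(byte[] buffer, int offset, int length) {
    fullBlobDigest.update(buffer, offset, length);
  }

  public String md5AtFlush() {
    // digest() consumes the accumulated state; call once, at flush time.
    return Base64.getEncoder().encodeToString(fullBlobDigest.digest());
  }
}
```

Because update() is order-sensitive, the accumulated digest only matches the service-computed Content-MD5 if blocks are digested in the same order they are committed in the block list.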
@@ -1914,7 +1920,11 @@ private List<AbfsHttpHeader> getMetadataHeadersList(final Hashtable<String, String>
// AzureBlobFileSystem supports only ASCII Characters in property values.
if (isPureASCII(value)) {
try {
value = encodeMetadataAttribute(value);
// URL encoding this JSON metadata, set by the WASB Client during file creation, causes compatibility issues.
// Therefore, we need to avoid encoding this metadata.
if (!XML_TAG_HDI_PERMISSION.equalsIgnoreCase(entry.getKey())) {
value = encodeMetadataAttribute(value);
}
} catch (UnsupportedEncodingException e) {
throw new InvalidAbfsRestOperationException(e);
}
@@ -2057,7 +2067,7 @@ public static String generateBlockListXml(String blockIdString) {

// Split the block ID string by commas and generate XML for each block ID
if (!blockIdString.isEmpty()) {
String[] blockIds = blockIdString.split(",");
String[] blockIds = blockIdString.split(COMMA);
for (String blockId : blockIds) {
stringBuilder.append(String.format(LATEST_BLOCK_FORMAT, blockId));
}
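For reference, a hedged sketch of the Put Block List body this produces for two block IDs (literal XML inlined; the real XML_VERSION and LATEST_BLOCK_FORMAT constants may differ in whitespace):

```java
public class BlockListSketch {
  public static void main(String[] args) {
    // Two made-up Base64 block IDs, comma-separated as in the client code.
    String blockIdString = "QUFB,QkJC";
    StringBuilder sb = new StringBuilder(
        "<?xml version=\"1.0\" encoding=\"utf-8\"?><BlockList>");
    for (String blockId : blockIdString.split(",")) {
      sb.append("<Latest>").append(blockId).append("</Latest>");
    }
    sb.append("</BlockList>");
    System.out.println(sb);
    // <?xml version="1.0" encoding="utf-8"?>
    //   <BlockList><Latest>QUFB</Latest><Latest>QkJC</Latest></BlockList>
  }
}
```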
@@ -879,27 +879,31 @@ public boolean appendSuccessCheckOp(AbfsRestOperation op, final String path,
* @param leaseId if there is an active lease on the path.
* @param contextEncryptionAdapter to provide encryption context.
* @param tracingContext for tracing the server calls.
* @param blobMd5 The Base64-encoded MD5 hash of the blob for data integrity validation.
* @return executed rest operation containing response from server.
* @throws AzureBlobFileSystemException if rest operation fails.
*/
public abstract AbfsRestOperation flush(String path, long position,
boolean retainUncommittedData, boolean isClose,
String cachedSasToken, String leaseId,
ContextEncryptionAdapter contextEncryptionAdapter, TracingContext tracingContext)
ContextEncryptionAdapter contextEncryptionAdapter, TracingContext tracingContext, String blobMd5)
throws AzureBlobFileSystemException;

/**
* Flush previously uploaded data to a file.
* @param buffer containing blockIds to be flushed.
* @param path on which data has to be flushed.
* @param isClose specify if this is the last flush to the file.
* @param cachedSasToken to be used for the authenticating operation.
* @param leaseId if there is an active lease on the path.
* @param eTag to specify conditional headers.
* @param contextEncryptionAdapter to provide encryption context.
* @param tracingContext for tracing the server calls.
* @return executed rest operation containing response from server.
* @throws AzureBlobFileSystemException if rest operation fails.
* Flushes previously uploaded data to the specified path.
Review comment (Contributor): NIT - Format can be consistent across places.
Author reply: Taken

*
* @param buffer The buffer containing block IDs to be flushed.
* @param path The file path to which data should be flushed.
* @param isClose True if this is the final flush (i.e., the file is being closed).
* @param cachedSasToken SAS token used for authentication (if applicable).
* @param leaseId Lease ID, if a lease is active on the file.
* @param eTag ETag used for conditional request headers (e.g., If-Match).
* @param contextEncryptionAdapter Adapter to provide encryption context, if encryption is enabled.
* @param tracingContext Context for tracing the server calls.
* @param blobMd5 The Base64-encoded MD5 hash of the blob for data integrity validation.
* @return The executed {@link AbfsRestOperation} containing the server response.
*
* @throws AzureBlobFileSystemException if the flush operation fails.
*/
public abstract AbfsRestOperation flush(byte[] buffer,
String path,
Expand All @@ -908,7 +912,7 @@ public abstract AbfsRestOperation flush(byte[] buffer,
String leaseId,
String eTag,
ContextEncryptionAdapter contextEncryptionAdapter,
TracingContext tracingContext) throws AzureBlobFileSystemException;
TracingContext tracingContext, String blobMd5) throws AzureBlobFileSystemException;

/**
* Set the properties of a file or directory.
@@ -1352,17 +1356,15 @@ private void appendIfNotEmpty(StringBuilder sb, String regEx,

/**
* Add MD5 hash as request header to the append request.
*
* @param requestHeaders to be updated with checksum header
* @param reqParams for getting offset and length
* @param buffer for getting input data for MD5 computation
* @throws AbfsRestOperationException if Md5 computation fails
*/
protected void addCheckSumHeaderForWrite(List<AbfsHttpHeader> requestHeaders,
final AppendRequestParameters reqParams, final byte[] buffer)
throws AbfsRestOperationException {
String md5Hash = computeMD5Hash(buffer, reqParams.getoffset(),
reqParams.getLength());
requestHeaders.add(new AbfsHttpHeader(CONTENT_MD5, md5Hash));
final AppendRequestParameters reqParams) {
if (reqParams.getMd5() != null) {
requestHeaders.add(new AbfsHttpHeader(CONTENT_MD5, reqParams.getMd5()));
}
}
