From 355f691c8022794a8ac3071ab8a89590b06ceae5 Mon Sep 17 00:00:00 2001 From: yl09099 Date: Thu, 19 Dec 2024 16:52:05 +0800 Subject: [PATCH] [Bug] Long tail tasks in the Write Stage retry phase result in data loss. --- .../uniffle/shuffle/manager/ShuffleManagerGrpcService.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/client-spark/common/src/main/java/org/apache/uniffle/shuffle/manager/ShuffleManagerGrpcService.java b/client-spark/common/src/main/java/org/apache/uniffle/shuffle/manager/ShuffleManagerGrpcService.java index 667b6a9056..349b822bb7 100644 --- a/client-spark/common/src/main/java/org/apache/uniffle/shuffle/manager/ShuffleManagerGrpcService.java +++ b/client-spark/common/src/main/java/org/apache/uniffle/shuffle/manager/ShuffleManagerGrpcService.java @@ -118,6 +118,9 @@ public void reportShuffleWriteFailure( // Clear the metadata of the completed task, otherwise some of the stage's data will // be lost. shuffleManager.unregisterAllMapOutput(shuffleId); + // Need to clear the mapStatus twice to prevent partition data loss due to the + // long-tail task performed before the stage retry. + shuffleManager.unregisterAllMapOutput(shuffleId); // Deregister the shuffleId corresponding to the Shuffle Server. shuffleManager.getShuffleWriteClient().unregisterShuffle(appId, shuffleId); shuffleServerWriterFailureRecord.setClearedMapTrackerBlock(true);