Consolidations of shuffle files from different map tasks #635

Closed · wants to merge 2 commits · showing changes from 1 commit
11 changes: 7 additions & 4 deletions core/src/main/scala/spark/BlockStoreShuffleFetcher.scala
@@ -22,16 +22,19 @@ private[spark] class BlockStoreShuffleFetcher extends ShuffleFetcher with Logging
logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
shuffleId, reduceId, System.currentTimeMillis - startTime))

-    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
-    for (((address, size), index) <- statuses.zipWithIndex) {
-      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
+    val splitsByAddress = new HashMap[BlockManagerId, HashMap[Int, Long]]
+    for ((address, groupId, size) <- statuses) {
+      val groupedSplits = splitsByAddress.getOrElseUpdate(address, new HashMap[Int, Long])
+      val currSize = groupedSplits.getOrElse(groupId, 0L)
+      if (size > currSize) groupedSplits.put(groupId, size)
     }

     val blocksByAddress: Seq[(BlockManagerId, Seq[(String, Long)])] = splitsByAddress.toSeq.map {
       case (address, splits) =>
-        (address, splits.map(s => ("shuffle_%d_%d_%d".format(shuffleId, s._1, reduceId), s._2)))
+        (address, splits.toSeq.map(s => ("shuffle_%d_%d_%d".format(shuffleId, s._1, reduceId), s._2)))
     }

logDebug("Fetched grouped splits: " + blocksByAddress)
def unpackBlock(blockPair: (String, Option[Iterator[Any]])) : Iterator[(K, V)] = {
val blockId = blockPair._1
val blockOption = blockPair._2
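The new grouping in BlockStoreShuffleFetcher can be exercised in isolation. This is a minimal sketch, with String standing in for BlockManagerId; each status is (address, groupId, size). Because several map tasks now share one consolidated file (group), we keep one entry per (address, groupId) and take the largest reported size:

```scala
import scala.collection.mutable.HashMap

object GroupSplits {
  // Group map-output statuses by server address and file group, keeping the
  // largest size reported for each group (mirrors the loop in the diff above).
  def group(statuses: Seq[(String, Int, Long)]): Map[String, Map[Int, Long]] = {
    val splitsByAddress = new HashMap[String, HashMap[Int, Long]]
    for ((address, groupId, size) <- statuses) {
      val groupedSplits = splitsByAddress.getOrElseUpdate(address, new HashMap[Int, Long])
      if (size > groupedSplits.getOrElse(groupId, 0L)) groupedSplits.put(groupId, size)
    }
    splitsByAddress.map { case (a, m) => (a, m.toMap) }.toMap
  }

  def main(args: Array[String]) {
    // Two statuses from host1 land in the same group; the larger size wins.
    println(group(Seq(("host1", 0, 10L), ("host1", 0, 25L), ("host2", 1, 5L))))
  }
}
```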
6 changes: 3 additions & 3 deletions core/src/main/scala/spark/MapOutputTracker.scala
@@ -117,7 +117,7 @@ private[spark] class MapOutputTracker extends Logging {
private val fetching = new HashSet[Int]

// Called on possibly remote nodes to get the server URIs and output sizes for a given shuffle
-  def getServerStatuses(shuffleId: Int, reduceId: Int): Array[(BlockManagerId, Long)] = {
+  def getServerStatuses(shuffleId: Int, reduceId: Int): Array[(BlockManagerId, Int, Long)] = {

Member: Add a comment on what the Int and Long represent.

val statuses = mapStatuses.get(shuffleId).orNull
if (statuses == null) {
logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them")
@@ -280,15 +280,15 @@ private[spark] object MapOutputTracker {
private def convertMapStatuses(
shuffleId: Int,
reduceId: Int,
-      statuses: Array[MapStatus]): Array[(BlockManagerId, Long)] = {
+      statuses: Array[MapStatus]): Array[(BlockManagerId, Int, Long)] = {
assert (statuses != null)
statuses.map {
status =>
if (status == null) {
throw new FetchFailedException(null, shuffleId, -1, reduceId,
new Exception("Missing an output location for shuffle " + shuffleId))
} else {
-        (status.location, decompressSize(status.compressedSizes(reduceId)))
+        (status.location, status.groupId, decompressSize(status.compressedSizes(reduceId)))
}
}
}
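The Long in each status comes from MapOutputTracker.decompressSize, which unpacks a one-byte size estimate. The exact encoding is not shown in this diff; the log-scale codec below (base 1.1, one byte per size) is an assumed sketch of how a size can ride along compactly, not necessarily Spark's exact constants:

```scala
object SizeCodec {
  private val LOG_BASE = 1.1

  // Encode a byte count into a single byte on a logarithmic scale.
  def compressSize(size: Long): Byte = {
    if (size == 0) 0
    else if (size <= 1L) 1
    else math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
  }

  // Decode back to an approximate (upper-bound) byte count.
  def decompressSize(compressedSize: Byte): Long = {
    if (compressedSize == 0) 0
    else math.pow(LOG_BASE, compressedSize & 0xFF).toLong
  }

  def main(args: Array[String]) {
    val c = compressSize(100000L)
    // Decoding recovers an upper bound on the original size.
    println(decompressSize(c) >= 100000L)  // true
  }
}
```

Rounding up means the decoded size never underestimates, which is the safe direction when sizing fetch buffers.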
2 changes: 2 additions & 0 deletions core/src/main/scala/spark/PairRDDFunctions.scala
@@ -69,12 +69,14 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
} else if (mapSideCombine) {
val mapSideCombined = self.mapPartitions(aggregator.combineValuesByKey(_), true)
val partitioned = new ShuffledRDD[K, C](mapSideCombined, partitioner, serializerClass)
      logInfo("serializerClass=" + serializerClass)

Member: Maybe change this to logDebug.

partitioned.mapPartitions(aggregator.combineCombinersByKey(_), true)
} else {
// Don't apply map-side combiner.
// A sanity check to make sure mergeCombiners is not defined.
assert(mergeCombiners == null)
val values = new ShuffledRDD[K, V](self, partitioner, serializerClass)
logInfo("serializerClass=" + serializerClass)
values.mapPartitions(aggregator.combineValuesByKey(_), true)
}
}
6 changes: 4 additions & 2 deletions core/src/main/scala/spark/scheduler/MapStatus.scala
@@ -8,19 +8,21 @@ import java.io.{ObjectOutput, ObjectInput, Externalizable}
* task ran on as well as the sizes of outputs for each reducer, for passing on to the reduce tasks.
* The map output sizes are compressed using MapOutputTracker.compressSize.
*/
-private[spark] class MapStatus(var location: BlockManagerId, var compressedSizes: Array[Byte])
+private[spark] class MapStatus(var location: BlockManagerId, var groupId: Int, var compressedSizes: Array[Byte])
extends Externalizable {

-  def this() = this(null, null) // For deserialization only
+  def this() = this(null, 0, null) // For deserialization only

def writeExternal(out: ObjectOutput) {
location.writeExternal(out)
out.writeInt(groupId)
out.writeInt(compressedSizes.length)
out.write(compressedSizes)
}

def readExternal(in: ObjectInput) {
location = BlockManagerId(in)
groupId = in.readInt()
compressedSizes = new Array[Byte](in.readInt())
in.readFully(compressedSizes)
}
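The write/read methods must stay symmetric, or the new groupId field will shift every later field on deserialization. The sketch below round-trips a simplified stand-in (String location instead of BlockManagerId) to show that groupId survives the Externalizable protocol; SimpleMapStatus is illustrative, not Spark's class:

```scala
import java.io._

// Simplified stand-in for MapStatus with the new groupId field.
class SimpleMapStatus(var location: String, var groupId: Int,
                      var compressedSizes: Array[Byte]) extends Externalizable {
  def this() = this(null, 0, null)  // for deserialization only

  def writeExternal(out: ObjectOutput) {
    out.writeUTF(location)
    out.writeInt(groupId)
    out.writeInt(compressedSizes.length)
    out.write(compressedSizes)
  }

  def readExternal(in: ObjectInput) {
    location = in.readUTF()
    groupId = in.readInt()
    compressedSizes = new Array[Byte](in.readInt())
    in.readFully(compressedSizes)
  }
}

object MapStatusRoundTrip {
  // Serialize to a byte array and back, as the driver/executors would.
  def roundTrip(status: SimpleMapStatus): SimpleMapStatus = {
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(status)
    oos.close()
    val ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray))
    ois.readObject().asInstanceOf[SimpleMapStatus]
  }

  def main(args: Array[String]) {
    val copy = roundTrip(new SimpleMapStatus("host1:7077", 3, Array[Byte](10, 20)))
    println(copy.groupId)  // 3
  }
}
```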
25 changes: 11 additions & 14 deletions core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
@@ -132,48 +132,45 @@ private[spark] class ShuffleMapTask(

val blockManager = SparkEnv.get.blockManager
var shuffle: ShuffleBlocks = null
-    var buckets: ShuffleWriterGroup = null
+    var group: ShuffleWriterGroup = null

try {
// Obtain all the block writers for shuffle blocks.
val ser = SparkEnv.get.serializerManager.get(dep.serializerClass)
shuffle = blockManager.shuffleBlockManager.forShuffle(dep.shuffleId, numOutputSplits, ser)
-      buckets = shuffle.acquireWriters(partition)
+      group = shuffle.acquireWriters(partition)

// Write the map output to its associated buckets.
for (elem <- rdd.iterator(split, taskContext)) {
val pair = elem.asInstanceOf[(Any, Any)]
val bucketId = dep.partitioner.getPartition(pair._1)
-        buckets.writers(bucketId).write(pair)
+        group.writers(bucketId).write(pair)
}

// Commit the writes. Get the size of each bucket block (total block size).
var totalBytes = 0L
-      val compressedSizes: Array[Byte] = buckets.writers.map { writer: BlockObjectWriter =>
-        writer.commit()
-        writer.close()
-        val size = writer.size()
-        totalBytes += size
-        MapOutputTracker.compressSize(size)
+      val compressedSizes: Array[Byte] = group.writers.map { writer: BlockObjectWriter =>
+        totalBytes += writer.commit()
+        MapOutputTracker.compressSize(writer.size())
       }

// Update shuffle metrics.
val shuffleMetrics = new ShuffleWriteMetrics
shuffleMetrics.shuffleBytesWritten = totalBytes
metrics.get.shuffleWriteMetrics = Some(shuffleMetrics)

-      return new MapStatus(blockManager.blockManagerId, compressedSizes)
+      return new MapStatus(blockManager.blockManagerId, group.id, compressedSizes)
} catch { case e: Exception =>
// If there is an exception from running the task, revert the partial writes
// and throw the exception upstream to Spark.
-      if (buckets != null) {
-        buckets.writers.foreach(_.revertPartialWrites())
+      if (group != null) {
+        group.writers.foreach(_.revertPartialWrites())
}
throw e
} finally {
// Release the writers back to the shuffle block manager.
-      if (shuffle != null && buckets != null) {
-        shuffle.releaseWriters(buckets)
+      if (shuffle != null && group != null) {
+        shuffle.releaseWriters(group)
}
// Execute the callbacks on task completion.
taskContext.executeOnCompleteCallbacks()
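The map-side loop above partitions each record into a bucket writer, then commits all writers and reports per-bucket sizes. A minimal standalone sketch, where FakeWriter and the modulo partitioner are hypothetical stand-ins for BlockObjectWriter and dep.partitioner, and "sizes" are record counts rather than bytes:

```scala
import scala.collection.mutable.ArrayBuffer

object ShuffleWriteSketch {
  // Stand-in for BlockObjectWriter: buffers records, "commits" by count.
  class FakeWriter {
    private val buf = new ArrayBuffer[(Any, Any)]
    def write(pair: (Any, Any)) { buf += pair }
    def commit(): Long = buf.size.toLong  // bytes written by this commit
    def size(): Long = buf.size.toLong    // total size of this bucket
  }

  // Returns (totalBytes, perBucketSizes), mirroring the loop in the diff.
  def run(records: Seq[(Int, String)], numBuckets: Int): (Long, Seq[Long]) = {
    val writers = Array.fill(numBuckets)(new FakeWriter)
    for (pair <- records) {
      val bucketId = math.abs(pair._1.hashCode) % numBuckets  // stand-in partitioner
      writers(bucketId).write(pair)
    }
    var totalBytes = 0L
    val sizes = writers.map { w => totalBytes += w.commit(); w.size() }
    (totalBytes, sizes.toSeq)
  }

  def main(args: Array[String]) {
    println(run(Seq((1, "a"), (2, "b"), (3, "c")), 2))
  }
}
```

The refactor in the diff folds commit/close/size into a single `writer.commit()` returning the bytes written, which is why the loop body shrinks to two lines.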
3 changes: 3 additions & 0 deletions core/src/main/scala/spark/storage/BlockManager.scala
@@ -287,6 +287,7 @@ private[spark] class BlockManager(
* never deletes (recent) items.
*/
def getLocalFromDisk(blockId: String, serializer: Serializer): Option[Iterator[Any]] = {
    shuffleBlockManager.closeBlock(blockId)

Member: Can you add a comment to the ShuffleBlockManager explaining that the blocks are closed here?

diskStore.getValues(blockId, serializer).orElse(
sys.error("Block " + blockId + " not found on disk, though it should be"))
}
@@ -382,6 +383,8 @@ private[spark] class BlockManager(
// As an optimization for map output fetches, if the block is for a shuffle, return it
// without acquiring a lock; the disk store never deletes (recent) items so this should work
if (ShuffleBlockManager.isShuffle(blockId)) {
      // Close the shuffle writers for blockId.

Member: Do shuffle blocks still walk through this code path? If not, I would just throw an exception here if it is a shuffle block.

Author: The remote shuffle blocks still go through here.

      shuffleBlockManager.closeBlock(blockId)
return diskStore.getBytes(blockId) match {
case Some(bytes) =>
Some(bytes)
28 changes: 25 additions & 3 deletions core/src/main/scala/spark/storage/DiskStore.scala
@@ -26,21 +26,27 @@ private class DiskStore(blockManager: BlockManager, rootDirs: String)
extends BlockStore(blockManager) with Logging {

class DiskBlockObjectWriter(blockId: String, serializer: Serializer, bufferSize: Int)
-    extends BlockObjectWriter(blockId) {
+    extends BlockObjectWriter(blockId) with Logging {

-    private val f: File = createFile(blockId /*, allowAppendExisting */)
+    private var f: File = createFile(blockId)

// The file channel, used for repositioning / truncating the file.
private var channel: FileChannel = null
private var bs: OutputStream = null
private var objOut: SerializationStream = null
private var lastValidPosition = 0L
private var initialPosition = 0L

override def open(): DiskBlockObjectWriter = {
val fos = new FileOutputStream(f, true)
channel = fos.getChannel()
bs = blockManager.wrapForCompression(blockId, new FastBufferedOutputStream(fos))
objOut = serializer.newInstance().serializeStream(bs)

      // Commit any header the compression/serialization streams wrote on open,
      // so that initialPosition marks the start of actual data.
      commit()
      initialPosition = lastValidPosition

this
}

@@ -59,7 +65,6 @@ private class DiskStore(blockManager: BlockManager, rootDirs: String)
// Flush the partial writes, and set valid length to be the length of the entire file.
// Return the number of bytes written for this commit.
override def commit(): Long = {
// NOTE: Flush the serializer first and then the compressed/buffered output stream
objOut.flush()
bs.flush()
val prevPos = lastValidPosition
@@ -68,11 +73,28 @@
}

override def revertPartialWrites() {
      // Revert by discarding the current writes, except that if no values have
      // been committed yet, we revert by recreating the file (otherwise there
      // are errors when reading objects from the file later on).
      if (lastValidPosition == initialPosition)

Member: initialPosition is set to the size of the file when the file is opened. Isn't it problematic if we open an existing file, do some writes, and then want to revert? We could delete the old file.

Author: A shuffle file is closed when we get a shuffle request, and it should never be re-opened (if you re-open the file and append to it, you may corrupt it, as it may have a trailer). Maybe we should first delete the shuffle file if it exists when we first open it (e.g., in case we need to re-run the map tasks).

Member: But if we re-run the map tasks, wouldn't that wipe out the shuffle outputs for all other previous map tasks?

Author: The shuffle file is closed when all the map tasks are done (after the shuffle request is received). If we need to re-run a map task afterwards:
(1) There is no easy way to remove the results of the previous run of this task from the shuffle file.
(2) You cannot re-open a closed shuffle file and append to it (e.g., Kryo will append a trailer when closing the file).
(3) If a fetch fails, Spark will re-run all the map tasks (for the same shuffle) whose output locations are at the failing node, so it is OK to delete their outputs.
I also tried throwing random exceptions during the shuffle and it works fine. Again, is there a good way to include such tests?

Member: Thanks for the response. It would be useful to put what you just wrote into the code comment block. For failure testing, can we do something similar to what FailureSuite does?

        recreateFile()
      else
        discardWrites()
    }

    private def recreateFile() {
      close()
      f.delete()
      f = createFile(blockId)
      open()
    }

    private def discardWrites() {
// Discard current writes. We do this by flushing the outstanding writes and
// truncate the file to the last valid position.
objOut.flush()
bs.flush()
channel.truncate(lastValidPosition)
channel.position(lastValidPosition)
}
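The truncate-based revert in discardWrites() can be demonstrated standalone: remember the channel position at the last commit, then truncate back to it to discard uncommitted bytes. A minimal sketch using a temp file (names here are illustrative):

```scala
import java.io.{File, FileOutputStream}

object RevertSketch {
  // Write 4 "committed" bytes, then 3 uncommitted ones, then revert.
  // Returns the file length after the revert (expected: 4).
  def run(): Long = {
    val f = File.createTempFile("revert-sketch", ".bin")
    val fos = new FileOutputStream(f, true)
    val channel = fos.getChannel()
    fos.write(Array[Byte](1, 2, 3, 4))
    fos.flush()
    val lastValidPosition = channel.position()  // recorded by commit()
    fos.write(Array[Byte](5, 6, 7))             // partial, uncommitted writes
    fos.flush()
    channel.truncate(lastValidPosition)         // revert
    channel.position(lastValidPosition)
    fos.close()
    val len = f.length()
    f.delete()
    len
  }

  def main(args: Array[String]) {
    println(run())  // 4
  }
}
```

The file channel shares its position with the owning stream, which is why recording `channel.position()` at commit time is enough to roll back later writes.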

override def write(value: Any) {