Add documentation and comments

Martin Foll · Martin Foll · commit 13eef9d04be2 · 2025-10-09T14:10:21.000+02:00
diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx
@@ -33,11 +33,20 @@ namespace TMVA {
 namespace Experimental {
 namespace Internal {
 
+// clang-format off
+/**
+\class ROOT::TMVA::Experimental::Internal::RBatchGenerator
+\ingroup tmva
+\brief Generates training and validation batches directly from a dataset in an RDataFrame.
+
+In this class, the processes of loading chunks (see RChunkLoader) and creating batches from those chunks (see RBatchLoader) are combined, allowing batches from the training and validation sets to be loaded directly from a dataset in an RDataFrame.
+*/
+
 template <typename... Args>
 class RBatchGenerator {
 private:
    std::vector<std::string> fCols;
-
+   // clang-format on
    std::size_t fChunkSize;
    std::size_t fMaxChunks;
    std::size_t fBatchSize;
@@ -128,15 +137,16 @@ public:
 
       fSumVecSizes = std::accumulate(vecSizes.begin(), vecSizes.end(), 0);
       fNumChunkCols = fNumColumns + fSumVecSizes - vecSizes.size();
-      // add the last element in entries to not go out range when filling chunks
+      
+      // add the last element in entries to not go out of range when filling chunks
       fEntries->push_back((*fEntries)[fNumEntries - 1] + 1);
 
       fChunkLoader =
          std::make_unique<RChunkLoader<Args...>>(f_rdf, fNumEntries, fEntries, fChunkSize, fBlockSize, fValidationSplit,
                                                  fCols, vecSizes, vecPadding, fShuffle, fSetSeed);
       fBatchLoader = std::make_unique<RBatchLoader>(fChunkSize, fBatchSize, fNumChunkCols);
 
-      // split the dataset into training and validation
+      // split the dataset into training and validation sets
       fChunkLoader->SplitDataset();
 
       fNumTrainingEntries = fChunkLoader->GetNumTrainingEntries();
@@ -214,6 +224,7 @@ public:
 
    void DeActivateValidationEpoch() { fValidationEpochActive = false; }
 
+   /// \brief Creates training batches by first loading a chunk (see RChunkLoader) and then splitting it into batches (see RBatchLoader)
    void CreateTrainBatches()
    {
 
@@ -229,6 +240,7 @@ public:
       fTrainingChunkNum++;
    }
 
+   /// \brief Creates validation batches by first loading a chunk (see RChunkLoader) and then splitting it into batches (see RBatchLoader)
    void CreateValidationBatches()
    {
 
@@ -244,10 +256,12 @@ public:
       fValidationChunkNum++;
    }
 
+   /// \brief Loads a training batch from the queue
    TMVA::Experimental::RTensor<float> GetTrainBatch()
    {
       auto batchQueue = fBatchLoader->GetNumTrainingBatchQueue();
 
+      // load the next chunk if the queue is empty
       if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) {
          fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum);
          std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum;
@@ -264,10 +278,12 @@ public:
       return fBatchLoader->GetTrainBatch();
    }
 
+   /// \brief Loads a validation batch from the queue   
    TMVA::Experimental::RTensor<float> GetValidationBatch()
    {
       auto batchQueue = fBatchLoader->GetNumValidationBatchQueue();
 
+      // load the next chunk if the queue is empty      
       if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) {
          fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum);
          std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum;
diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx
@@ -31,8 +31,18 @@ namespace TMVA {
 namespace Experimental {
 namespace Internal {
 
+// clang-format off
+/**
+\class ROOT::TMVA::Experimental::Internal::RBatchLoader
+\ingroup tmva
+\brief Builds and loads batches from the chunks loaded by RChunkLoader
+
+In this class, the chunks that are loaded into memory (see RChunkLoader) are split into batches for ML training, which are then loaded into a queue. This is done separately for the training and validation chunks.
+*/
+
 class RBatchLoader {
 private:
+   // clang-format on      
    std::size_t fChunkSize;
    std::size_t fBatchSize;
    std::size_t fNumColumns;
@@ -45,14 +55,18 @@ private:
    std::mutex fBatchLock;
    std::condition_variable fBatchCondition;
 
+   // queues of tensors holding the training and validation batches
    std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fTrainingBatchQueue;
    std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fValidationBatchQueue;
 
+   // number of training and validation batches in the queue
    std::size_t fNumTrainingBatchQueue;
    std::size_t fNumValidationBatchQueue;
 
+   // current batch that is loaded into memory
    std::unique_ptr<TMVA::Experimental::RTensor<float>> fCurrentBatch;
 
+   // primary and secondary batches used to create batches from a chunk
    std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverTrainingBatch;
    std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverTrainingBatch;
 
@@ -104,6 +118,8 @@ public:
 
    /// \brief Return a batch of data as a unique pointer.
    /// After the batch has been processed, it should be destroyed.
+   /// \param[in] chunkTensor RTensor with the data from the chunk
+   /// \param[in] idxs Index of batch in the chunk
    /// \return Training batch
    std::unique_ptr<TMVA::Experimental::RTensor<float>>
    CreateBatch(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t idxs)
@@ -116,6 +132,9 @@ public:
       return batch;
    }
 
+   
+   /// \brief Loading the training batch from the queue
+   /// \return Training batch
    TMVA::Experimental::RTensor<float> GetTrainBatch()
    {
 
@@ -130,6 +149,8 @@ public:
       return *fCurrentBatch;
    }
 
+   /// \brief Loading the validation batch from the queue
+   /// \return Validation batch
    TMVA::Experimental::RTensor<float> GetValidationBatch()
    {
 
@@ -144,67 +165,89 @@ public:
       return *fCurrentBatch;
    }
 
+   /// \brief Creating the training batches from a chunk and add them to the queue.
+   /// \param[in] chunkTensor RTensor with the data from the chunk
+   /// \param[in] lastbatch Check if the batch in the chunk is the last one
+   /// \param[in] leftoverBatchSize Size of the leftover batch in the training dataset
+   /// \param[in] dropRemainder Bool to drop the remainder batch or not
    void CreateTrainingBatches(TMVA::Experimental::RTensor<float> &chunkTensor, int lastbatch,
                               std::size_t leftoverBatchSize, bool dropRemainder)
    {
       std::size_t ChunkSize = chunkTensor.GetShape()[0];
       std::size_t Batches = ChunkSize / fBatchSize;
       std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
 
+      // create a vector of batches
       std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
 
+      // fill the full batches from the chunk into a vector
       for (std::size_t i = 0; i < Batches; i++) {
          // Fill a batch
          batches.emplace_back(CreateBatch(chunkTensor, i));
       }
 
+      // copy the remaining entries from the chunk into a leftover batch
       TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, fNumColumns});
       std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
                 chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns + LeftoverBatchSize * fNumColumns),
                 LeftoverBatch.GetData());
 
+      // calculate how many empty slots are left in fPrimaryLeftoverTrainingBatch
       std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverTrainingBatch).GetShape()[0];
       std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
 
+      // copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
       if (emptySlots >= LeftoverBatchSize) {
          (*fPrimaryLeftoverTrainingBatch) =
             (*fPrimaryLeftoverTrainingBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, fNumColumns});
          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
                    fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
 
+         // if fPrimaryLeftoverTrainingBatch is now exactly full, add a copy of it to the batch vector
          if (emptySlots == LeftoverBatchSize) {
             auto copy =
                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
             std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
                       fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
             batches.emplace_back(std::move(copy));
 
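+            // copy fSecondaryLeftoverTrainingBatch into fPrimaryLeftoverTrainingBatch, then reset a secondary batch (NOTE(review): the reset below targets fSecondaryLeftoverValidationBatch, not the training one; confirm)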
             *fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
             fSecondaryLeftoverValidationBatch =
                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
          }
       }
 
+      // copy LeftoverBatch to both fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
       else if (emptySlots < LeftoverBatchSize) {
+         // copy the first part of LeftoverBatch to end of fPrimaryLeftoverTrainingBatch 
          (*fPrimaryLeftoverTrainingBatch) = (*fPrimaryLeftoverTrainingBatch).Resize({fBatchSize, fNumColumns});
          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
                    fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
 
+         // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverTrainingBatch
          (*fSecondaryLeftoverTrainingBatch) =
             (*fSecondaryLeftoverTrainingBatch).Resize({LeftoverBatchSize - emptySlots, fNumColumns});
          std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
                    LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
                    fSecondaryLeftoverTrainingBatch->GetData());
+         
+         // add fPrimaryLeftoverTrainingBatch to the batch vector
          auto copy =
             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
          std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
                    fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
          batches.emplace_back(std::move(copy));
+         
+         // copy fSecondaryLeftoverTrainingBatch into fPrimaryLeftoverTrainingBatch
          *fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
+         
+         // reset fSecondaryLeftoverValidationBatch (NOTE(review): presumably fSecondaryLeftoverTrainingBatch was intended here; confirm)
          fSecondaryLeftoverValidationBatch =
             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
       }
 
+      // copy the content of fPrimaryLeftoverTrainingBatch to the leftover batch from the chunk
       if (lastbatch == 1) {
 
          if (dropRemainder == false && leftoverBatchSize > 0) {
@@ -221,11 +264,17 @@ public:
             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
       }
 
+      // append the batches from the batch vector from the chunk to the training batch queue
       for (std::size_t i = 0; i < batches.size(); i++) {
          fTrainingBatchQueue.push(std::move(batches[i]));
       }
    }
-
+   
+   /// \brief Creating the validation batches from a chunk and adding them to the queue
+   /// \param[in] chunkTensor RTensor with the data from the chunk
+   /// \param[in] lastbatch Check if the batch in the chunk is the last one
+   /// \param[in] leftoverBatchSize Size of the leftover batch in the validation dataset
+   /// \param[in] dropRemainder Bool to drop the remainder batch or not
    void CreateValidationBatches(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t lastbatch,
                                 std::size_t leftoverBatchSize, bool dropRemainder)
    {
diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx
@@ -28,7 +28,33 @@ namespace TMVA {
 namespace Experimental {
 namespace Internal {
 
+// clang-format off
+/**
+\class ROOT::TMVA::Experimental::Internal::RChunkConstructor
+\ingroup tmva
+\brief The logic for constructing chunks from a dataset.
+
+This struct handles the logic for splitting a dataset into smaller subsets 
+known as chunks, which are constructed from blocks.
+ 
+A chunk is the largest portion of the dataset loaded into memory at once, 
+and each chunk is further divided into batches for machine learning training.
+ 
+The dataset is split into disjoint chunks based on a user-defined chunk size.
+There are two types of chunks:
+ - Full chunks: contain exactly the number of entries specified by the chunk size.
+ - Leftover chunk: contains any remaining entries that don't make up a full chunk.
+ 
+Each chunk is constructed from blocks based on a user-defined block size.
+There are two types of blocks:
+ - Full blocks: contain exactly the number of entries specified by the block size.
+ - Leftover block: contains any remaining entries that don't make up a full block.
+
+The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number of entries.
+*/
+
 struct RChunkConstructor {
+   // clang-format on
    std::size_t fNumEntries;
    std::size_t fChunkSize;
    std::size_t fBlockSize;
@@ -75,6 +101,7 @@ struct RChunkConstructor {
    // total number of blocks
    std::size_t NumberOfBlocks;
 
+   // pair of start and end entries in the different block types
    std::vector<std::pair<Long_t, Long_t>> BlockIntervals = {};
 
    std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks = {};
@@ -135,6 +162,8 @@ struct RChunkConstructor {
       NumberOfBlocks = std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), 0);
    };
 
+   //////////////////////////////////////////////////////////////////////////
+   /// \brief Group the blocks by type (full or leftover), determined by the size of each block.
    void DistributeBlockIntervals()
    {
 
@@ -155,6 +184,8 @@ struct RChunkConstructor {
       }
    }
 
+   //////////////////////////////////////////////////////////////////////////
+   /// \brief Creates chunks from the dataset, each consisting of blocks defined by their begin and end entries.
    void CreateChunksIntervals()
    {
 
@@ -192,6 +223,8 @@ struct RChunkConstructor {
       }
    }
 
+   //////////////////////////////////////////////////////////////////////////
+   /// \brief Fills a vector with the size of every chunk from the dataset 
    void SizeOfChunks()
    {
 
diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkLoader.hxx