Skip to content

Commit 13eef9d

Browse files
author
Martin Foll
committed
Add documentation and comments
1 parent 18936f7 commit 13eef9d

File tree

4 files changed

+163
-6
lines changed

4 files changed

+163
-6
lines changed

tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,20 @@ namespace TMVA {
3333
namespace Experimental {
3434
namespace Internal {
3535

36+
// clang-format off
37+
/**
38+
\class ROOT::TMVA::Experimental::Internal::RBatchGenerator
39+
\ingroup tmva
40+
\brief
41+
42+
In this class, the processes of loading chunks (see RChunkLoader) and creating batches from those chunks (see RBatchLoader) are combined, allowing batches from the training and validation sets to be loaded directly from a dataset in an RDataFrame.
43+
*/
44+
3645
template <typename... Args>
3746
class RBatchGenerator {
3847
private:
3948
std::vector<std::string> fCols;
40-
49+
// clang-format on
4150
std::size_t fChunkSize;
4251
std::size_t fMaxChunks;
4352
std::size_t fBatchSize;
@@ -128,15 +137,16 @@ public:
128137

129138
fSumVecSizes = std::accumulate(vecSizes.begin(), vecSizes.end(), 0);
130139
fNumChunkCols = fNumColumns + fSumVecSizes - vecSizes.size();
131-
// add the last element in entries to not go out range when filling chunks
140+
141+
// add the last element in entries to not go out of range when filling chunks
132142
fEntries->push_back((*fEntries)[fNumEntries - 1] + 1);
133143

134144
fChunkLoader =
135145
std::make_unique<RChunkLoader<Args...>>(f_rdf, fNumEntries, fEntries, fChunkSize, fBlockSize, fValidationSplit,
136146
fCols, vecSizes, vecPadding, fShuffle, fSetSeed);
137147
fBatchLoader = std::make_unique<RBatchLoader>(fChunkSize, fBatchSize, fNumChunkCols);
138148

139-
// split the dataset into training and validation
149+
// split the dataset into training and validation sets
140150
fChunkLoader->SplitDataset();
141151

142152
fNumTrainingEntries = fChunkLoader->GetNumTrainingEntries();
@@ -214,6 +224,7 @@ public:
214224

215225
void DeActivateValidationEpoch() { fValidationEpochActive = false; }
216226

227+
/// \brief Create training batches by first loading a chunk (see RChunkLoader) and then splitting it into batches (see RBatchLoader)
217228
void CreateTrainBatches()
218229
{
219230

@@ -229,6 +240,7 @@ public:
229240
fTrainingChunkNum++;
230241
}
231242

243+
/// \brief Create validation batches by first loading a chunk (see RChunkLoader) and then splitting it into batches (see RBatchLoader)
232244
void CreateValidationBatches()
233245
{
234246

@@ -244,10 +256,12 @@ public:
244256
fValidationChunkNum++;
245257
}
246258

259+
/// \brief Loads a training batch from the queue
247260
TMVA::Experimental::RTensor<float> GetTrainBatch()
248261
{
249262
auto batchQueue = fBatchLoader->GetNumTrainingBatchQueue();
250263

264+
// load the next chunk if the queue is empty
251265
if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) {
252266
fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum);
253267
std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum;
@@ -264,10 +278,12 @@ public:
264278
return fBatchLoader->GetTrainBatch();
265279
}
266280

281+
/// \brief Loads a validation batch from the queue
267282
TMVA::Experimental::RTensor<float> GetValidationBatch()
268283
{
269284
auto batchQueue = fBatchLoader->GetNumValidationBatchQueue();
270285

286+
// load the next chunk if the queue is empty
271287
if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) {
272288
fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum);
273289
std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum;

tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,18 @@ namespace TMVA {
3131
namespace Experimental {
3232
namespace Internal {
3333

34+
// clang-format off
35+
/**
36+
\class ROOT::TMVA::Experimental::Internal::RBatchLoader
37+
\ingroup tmva
38+
\brief Building and loading the batches from loaded chunks in RChunkLoader
39+
40+
In this class the chunks that are loaded into memory (see RChunkLoader) are split into batches used in the ML training, which are loaded into a queue. This is done for both the training and validation chunks separately.
41+
*/
42+
3443
class RBatchLoader {
3544
private:
45+
// clang-format on
3646
std::size_t fChunkSize;
3747
std::size_t fBatchSize;
3848
std::size_t fNumColumns;
@@ -45,14 +55,18 @@ private:
4555
std::mutex fBatchLock;
4656
std::condition_variable fBatchCondition;
4757

58+
// queues of tensors of the training and validation batches
4859
std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fTrainingBatchQueue;
4960
std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fValidationBatchQueue;
5061

62+
// number of training and validation batches in the queue
5163
std::size_t fNumTrainingBatchQueue;
5264
std::size_t fNumValidationBatchQueue;
5365

66+
// current batch that is loaded into memory
5467
std::unique_ptr<TMVA::Experimental::RTensor<float>> fCurrentBatch;
5568

69+
// primary and secondary batches used to create batches from a chunk
5670
std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverTrainingBatch;
5771
std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverTrainingBatch;
5872

@@ -104,6 +118,8 @@ public:
104118

105119
/// \brief Return a batch of data as a unique pointer.
106120
/// After the batch has been processed, it should be destroyed.
121+
/// \param[in] chunkTensor RTensor with the data from the chunk
122+
/// \param[in] idxs Index of batch in the chunk
107123
/// \return Training batch
108124
std::unique_ptr<TMVA::Experimental::RTensor<float>>
109125
CreateBatch(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t idxs)
@@ -116,6 +132,9 @@ public:
116132
return batch;
117133
}
118134

135+
136+
/// \brief Loading the training batch from the queue
137+
/// \return Training batch
119138
TMVA::Experimental::RTensor<float> GetTrainBatch()
120139
{
121140

@@ -130,6 +149,8 @@ public:
130149
return *fCurrentBatch;
131150
}
132151

152+
/// \brief Loading the validation batch from the queue
153+
/// \return Validation batch
133154
TMVA::Experimental::RTensor<float> GetValidationBatch()
134155
{
135156

@@ -144,67 +165,89 @@ public:
144165
return *fCurrentBatch;
145166
}
146167

168+
/// \brief Creating the training batches from a chunk and add them to the queue.
169+
/// \param[in] chunkTensor RTensor with the data from the chunk
170+
/// \param[in] lastbatch Check if the batch in the chunk is the last one
171+
/// \param[in] leftoverBatchSize Size of the leftover batch in the training dataset
172+
/// \param[in] dropRemainder Bool to drop the remainder batch or not
147173
void CreateTrainingBatches(TMVA::Experimental::RTensor<float> &chunkTensor, int lastbatch,
148174
std::size_t leftoverBatchSize, bool dropRemainder)
149175
{
150176
std::size_t ChunkSize = chunkTensor.GetShape()[0];
151177
std::size_t Batches = ChunkSize / fBatchSize;
152178
std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
153179

180+
// create a vector of batches
154181
std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
155182

183+
// fill the full batches from the chunk into a vector
156184
for (std::size_t i = 0; i < Batches; i++) {
157185
// Fill a batch
158186
batches.emplace_back(CreateBatch(chunkTensor, i));
159187
}
160188

189+
// copy the remaining entries from the chunk into a leftover batch
161190
TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, fNumColumns});
162191
std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
163192
chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns + LeftoverBatchSize * fNumColumns),
164193
LeftoverBatch.GetData());
165194

195+
// calculate how many empty slots are left in fPrimaryLeftoverTrainingBatch
166196
std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverTrainingBatch).GetShape()[0];
167197
std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
168198

199+
// copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
169200
if (emptySlots >= LeftoverBatchSize) {
170201
(*fPrimaryLeftoverTrainingBatch) =
171202
(*fPrimaryLeftoverTrainingBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, fNumColumns});
172203
std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
173204
fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
174205

206+
// copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch and add it to the batch vector
175207
if (emptySlots == LeftoverBatchSize) {
176208
auto copy =
177209
std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
178210
std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
179211
fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
180212
batches.emplace_back(std::move(copy));
181213

214+
// reset fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
182215
*fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
183216
fSecondaryLeftoverValidationBatch =
184217
std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
185218
}
186219
}
187220

221+
// copy LeftoverBatch to both fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
188222
else if (emptySlots < LeftoverBatchSize) {
223+
// copy the first part of LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
189224
(*fPrimaryLeftoverTrainingBatch) = (*fPrimaryLeftoverTrainingBatch).Resize({fBatchSize, fNumColumns});
190225
std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
191226
fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
192227

228+
// copy the last part of LeftoverBatch to the end of fSecondaryLeftoverTrainingBatch
193229
(*fSecondaryLeftoverTrainingBatch) =
194230
(*fSecondaryLeftoverTrainingBatch).Resize({LeftoverBatchSize - emptySlots, fNumColumns});
195231
std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
196232
LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
197233
fSecondaryLeftoverTrainingBatch->GetData());
234+
235+
// add fPrimaryLeftoverTrainingBatch to the batch vector
198236
auto copy =
199237
std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
200238
std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
201239
fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
202240
batches.emplace_back(std::move(copy));
241+
242+
// replace the content of fPrimaryLeftoverTrainingBatch with that of fSecondaryLeftoverTrainingBatch
203243
*fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
244+
245+
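// reset fSecondaryLeftoverValidationBatch (NOTE(review): this is the training path, so fSecondaryLeftoverTrainingBatch was probably intended; confirm)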
204246
fSecondaryLeftoverValidationBatch =
205247
std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
206248
}
207249

250+
// copy the content of fPrimaryLeftoverTrainingBatch to the leftover batch from the chunk
208251
if (lastbatch == 1) {
209252

210253
if (dropRemainder == false && leftoverBatchSize > 0) {
@@ -221,11 +264,17 @@ public:
221264
std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
222265
}
223266

267+
// append the batches from the batch vector from the chunk to the training batch queue
224268
for (std::size_t i = 0; i < batches.size(); i++) {
225269
fTrainingBatchQueue.push(std::move(batches[i]));
226270
}
227271
}
228-
272+
273+
/// \brief Creating the validation batches from a chunk and adding them to the queue
274+
/// \param[in] chunkTensor RTensor with the data from the chunk
275+
/// \param[in] lastbatch Check if the batch in the chunk is the last one
276+
/// \param[in] leftoverBatchSize Size of the leftover batch in the validation dataset
277+
/// \param[in] dropRemainder Bool to drop the remainder batch or not
229278
void CreateValidationBatches(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t lastbatch,
230279
std::size_t leftoverBatchSize, bool dropRemainder)
231280
{

tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,33 @@ namespace TMVA {
2828
namespace Experimental {
2929
namespace Internal {
3030

31+
// clang-format off
32+
/**
33+
\class ROOT::TMVA::Experimental::Internal::RChunkConstructor
34+
\ingroup tmva
35+
\brief The logic for constructing chunks from a dataset.
36+
37+
This struct handles the logic for splitting a dataset into smaller subsets
38+
known as chunks, which are constructed from blocks.
39+
40+
A chunk is the largest portion of the dataset loaded into memory at once,
41+
and each chunk is further divided into batches for machine learning training.
42+
43+
The dataset is split into disjoint chunks based on a user-defined chunk size.
44+
There are two types of chunks:
45+
- Full chunks: contain exactly the number of entries specified by the chunk size.
46+
- Leftover chunk: contains any remaining entries that don't make up a full chunk.
47+
48+
Each chunk is constructed from blocks based on a user-defined block size.
49+
There are two types of blocks:
50+
- Full blocks: contain exactly the number of entries specified by the block size.
51+
- Leftover block: contains any remaining entries that don't make up a full block.
52+
53+
The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number of entries.
54+
*/
55+
3156
struct RChunkConstructor {
57+
// clang-format on
3258
std::size_t fNumEntries;
3359
std::size_t fChunkSize;
3460
std::size_t fBlockSize;
@@ -75,6 +101,7 @@ struct RChunkConstructor {
75101
// total number of blocks
76102
std::size_t NumberOfBlocks;
77103

104+
// pair of start and end entries in the different block types
78105
std::vector<std::pair<Long_t, Long_t>> BlockIntervals = {};
79106

80107
std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks = {};
@@ -135,6 +162,8 @@ struct RChunkConstructor {
135162
NumberOfBlocks = std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), 0);
136163
};
137164

165+
//////////////////////////////////////////////////////////////////////////
166+
/// \brief Group the blocks by type (full or leftover) according to the size of each block.
138167
void DistributeBlockIntervals()
139168
{
140169

@@ -155,6 +184,8 @@ struct RChunkConstructor {
155184
}
156185
}
157186

187+
//////////////////////////////////////////////////////////////////////////
188+
/// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry.
158189
void CreateChunksIntervals()
159190
{
160191

@@ -192,6 +223,8 @@ struct RChunkConstructor {
192223
}
193224
}
194225

226+
//////////////////////////////////////////////////////////////////////////
227+
/// \brief Fills a vector with the size of every chunk from the dataset
195228
void SizeOfChunks()
196229
{
197230

0 commit comments

Comments
 (0)