Skip to content

Commit 84657e2

Browse files
authored
Added integrator flag (GPU), Optimized projection in acceptance boundary search (#969)
* Added kwargs to get_acceptance() calls to allow specific GPU parameters to be passed * Added trace * Optimized projection for boundary search (GridMode RADIAL or CARTESIAN) * Fix for empty lattice * Fix wrong memory freeing * Run black * end_elem initialized to 0 * Updated doc * Updated stubs * Added unit test * Updated doc * Added unit test for errors * Updated doc
1 parent e053163 commit 84657e2

File tree

11 files changed

+221
-116
lines changed

11 files changed

+221
-116
lines changed

atgpu/Lattice.cpp

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,6 @@ void Lattice::generateGPUKernel() {
200200
// GPU main track function
201201
code.append(kType + "void track(" + mType + "RING_PARAM *ringParam," + mType + "ELEMENT* gpuRing,\n"
202202
" uint32_t startElem,uint32_t nbElementToProcess,\n"
203-
" " + mType + "int32_t *elemOffsets, uint32_t offsetStride,\n"
204203
" uint32_t nbPart," + mType + "AT_FLOAT* rin," + mType + "AT_FLOAT* rout,\n"
205204
" " + mType + "uint32_t* lost, uint32_t turn,\n"
206205
" " + mType + "int32_t *refpts, uint32_t nbRef,\n"
@@ -228,12 +227,9 @@ void Lattice::generateGPUKernel() {
228227
code.append(" sr6[4] = rin[4 + 6*threadId];\n"); // d = (pz-p0)/p0
229228
code.append(" sr6[5] = rin[5 + 6*threadId];\n"); // c.tau (time lag)
230229

231-
// The tracking starts at elem elemOffset (equivalent to lattice.rotate(elemOffset))
232-
code.append(" int32_t elemOffset = elemOffsets?elemOffsets[threadId/offsetStride]:0;\n");
233-
234230
// Loop over elements
235231
code.append(" while(!pLost && elem<startElem+nbElementToProcess) {\n");
236-
code.append(" "+mType+" ELEMENT* elemPtr = gpuRing + ((elem+elemOffset)%NB_TOTAL_ELEMENT);\n");
232+
code.append(" "+mType+" ELEMENT* elemPtr = gpuRing + elem;\n");
237233
storeParticleCoord(code);
238234
code.append(" switch(elemPtr->Type) {\n");
239235
factory.generatePassMethodsCalls(code);
@@ -307,7 +303,7 @@ uint64_t Lattice::fillGPUMemory() {
307303
}
308304

309305
void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,
310-
uint32_t *refPts,uint32_t nbElemOffset,uint32_t *elemOffsets,
306+
uint32_t *refPts,uint32_t startElem,uint32_t endElem,
311307
uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
312308
bool updateRin) {
313309

@@ -371,28 +367,14 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
371367
if(lostAtElem) gpu->allocDevice(&gpuLostAtElem, nbParticles * sizeof(uint32_t));
372368
if(lostAtCoord) gpu->allocDevice(&gpuLostAtCoord, nbParticles * 6 * sizeof(AT_FLOAT));
373369

374-
// Starting element (lattice rotation)
375-
void *gpuOffsets = nullptr;
376-
uint32_t offsetStride = 0;
377-
if(nbElemOffset) {
378-
gpu->allocDevice(&gpuOffsets, nbElemOffset * sizeof(int32_t));
379-
gpu->hostToDevice(gpuOffsets, elemOffsets, nbElemOffset * sizeof(int32_t));
380-
// nbParticles % nbElemOffset must be 0
381-
// offsetStride (nbParticles/nbElemOffset) should be a multiple of 64
382-
offsetStride = nbParticles / nbElemOffset;
383-
}
384-
385370
// Call GPU
386371
gpu->resetArg();
387-
uint32_t startElem;
388-
uint32_t nbElemToProcess = (uint32_t)elements.size();
372+
uint32_t nbElemToProcess = endElem - startElem;
389373
uint32_t turn;
390374
gpu->addArg(sizeof(void *),&gpuRingParams); // Global ring params
391375
gpu->addArg(sizeof(void *),&gpuRing); // The lattice
392376
gpu->addArg(sizeof(uint32_t),&startElem); // Process from
393377
gpu->addArg(sizeof(uint32_t),&nbElemToProcess); // Number of element to process
394-
gpu->addArg(sizeof(void *),&gpuOffsets); // Tracking start offsets
395-
gpu->addArg(sizeof(uint32_t),&offsetStride); // Number of particle which starts at elem specified in gpuOffsets
396378
gpu->addArg(sizeof(uint32_t),&nbParticles); // Total number of particle to track
397379
gpu->addArg(sizeof(void *),&gpuRin); // Input 6D coordinates
398380
gpu->addArg(sizeof(void *),&gpuRout); // Output 6D coordinates
@@ -405,8 +387,9 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
405387

406388
// Turn loop
407389
for(turn=0;turn<nbTurn;turn++) {
408-
startElem = 0;
409390
gpu->run(nbParticles); // 1 particle per thread
391+
startElem = 0;
392+
nbElemToProcess = (uint32_t)elements.size();
410393
}
411394

412395
// Get back data
@@ -441,7 +424,6 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
441424
gpu->freeDevice(gpuRingParams);
442425
if( gpuLostAtElem ) gpu->freeDevice(gpuLostAtElem);
443426
if( gpuLostAtCoord ) gpu->freeDevice(gpuLostAtCoord);
444-
if( gpuOffsets ) gpu->freeDevice(gpuOffsets);
445427
delete[] expandedRefPts;
446428
delete[] lost;
447429

atgpu/Lattice.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class Lattice {
2727
void generateGPUKernel();
2828
// Run the simulation
2929
void run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,uint32_t *refPts,
30-
uint32_t nbElemOffset,uint32_t *elemOffsets,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
30+
uint32_t startElem,uint32_t endElem,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
3131
bool updateRin);
3232
// Return handle to the GPU context
3333
GPUContext *getGPUContext();

atgpu/PyInterface.cpp

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,20 @@ static PyMethodDef AtGPUMethods[] = {
4141
" reuse: if True, use previously cached description of the lattice.\n"
4242
" losses: if True, process losses\n"
4343
" gpu_pool: List of GPU id to use\n"
44-
" tracking_starts: numpy array of indices of elements where tracking should start.\n"
45-
" len(tracking_start) must divide the number of particle and it gives the stride size.\n"
46-
" The i-th particle of rin starts at elem tracking_start[i/stride].\n"
47-
" The behavior is similar to lattice.rotate(tracking_starts[i/stride]).\n"
48-
" The stride size should be multiple of 64 for best performance.\n"
4944
" integrator: Type of integrator to use.\n"
5045
" 1: Euler 1st order, 1 drift/1 kick per step.\n"
5146
" 2: Mclachlan 2nd order, 2 drift/2 kicks per step.\n"
5247
" 3: Ruth 3rd order, 3 drifts/3 kicks per step.\n"
5348
" 4: Forest/Ruth 4th order, 4 drifts/3 kicks per step (Default).\n"
5449
" 5: Optimal 4th order from R. Mclachlan, 4 drifts/4 kicks per step.\n"
55-
" 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n\n"
50+
" 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n"
51+
" start_elem: Start tracking at the specified element. The\n"
52+
" behavior is different from :py:func:`..lattice.rotate`, refpts are\n"
53+
" still indexed from the begininig of the ring and s_coord starts at\n"
54+
" the beginning of the ring. Default 0.\n"
55+
" end_elem: End tracking at the specified element (exclusive).\n"
56+
" When end_elem is different from the number of lattice elements,\n"
57+
" nturns=1 is required. Default number of elements.\n\n"
5658
"Returns:\n"
5759
" rout: 6 x n_particles x n_refpts x n_turns Fortran-ordered numpy array\n"
5860
" of particle coordinates\n\n"
@@ -138,7 +140,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
138140
"energy", "particle", "keep_counter",
139141
"reuse","losses",
140142
"bunch_spos", "bunch_currents", "gpu_pool",
141-
"tracking_starts","integrator","verbose",
143+
"integrator","verbose",
144+
"start_elem","end_elem",
142145
nullptr};
143146

144147
NPY_TYPES floatType;
@@ -168,10 +171,12 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
168171
int counter=0;
169172
int losses=0;
170173
int integratorType=4;
174+
int startElem=0;
175+
int endElem=-1;
171176
double t0,t1;
172177

173178
// Get input args
174-
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!O!ip", const_cast<char **>(kwlist),
179+
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!ipii", const_cast<char **>(kwlist),
175180
&PyList_Type, &lattice,
176181
&PyArray_Type, &rin,
177182
&num_turns,
@@ -185,9 +190,10 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
185190
&PyArray_Type, &bspos,
186191
&PyArray_Type, &bcurrents,
187192
&PyList_Type, &gpupool,
188-
&PyArray_Type, &trackstarts,
189193
&integratorType,
190-
&verbose
194+
&verbose,
195+
&startElem,
196+
&endElem
191197
)) {
192198
return nullptr;
193199
}
@@ -292,6 +298,32 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
292298

293299
}
294300

301+
int num_elements = gpuLattice->getNbElement();
302+
303+
// Check for partial turn tracking
304+
if(endElem==-1) endElem = num_elements;
305+
if(num_elements!=0) {
306+
if(startElem<0 || startElem>=num_elements) {
307+
char errMsg[64];
308+
sprintf(errMsg,"start_elem out of range, %d not in [0..%d]",startElem,num_elements-1);
309+
return PyErr_Format(PyExc_ValueError, errMsg);
310+
}
311+
if(endElem<0 || endElem>num_elements) {
312+
char errMsg[64];
313+
sprintf(errMsg,"end_elem out of range, %d not in [0..%d]",endElem,num_elements);
314+
return PyErr_Format(PyExc_ValueError, errMsg);
315+
}
316+
if(startElem>endElem) {
317+
return PyErr_Format(PyExc_ValueError, "end_elem must be greater that start_elem");
318+
}
319+
if((endElem!=num_elements) && (num_turns!=1)) {
320+
return PyErr_Format(PyExc_ValueError, "partial turn tracking not ending at the end of the ring requires nturns=1");
321+
}
322+
} else {
323+
startElem = 0;
324+
endElem = 0;
325+
}
326+
295327
// Load lattice on the GPU
296328
try {
297329
uint64_t size = gpuLattice->fillGPUMemory();
@@ -309,7 +341,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
309341
try {
310342

311343
if( verbose )
312-
cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " << gpuLattice->getGPUContext()->name() << " #" << gpuId << endl;
344+
cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " <<
345+
gpuLattice->getGPUContext()->name() << " #" << gpuId << " Integrator: #" << integratorType << endl;
313346

314347
npy_intp outdims[4] = {6,(npy_intp)(num_particles),num_refs,num_turns};
315348
PyObject *rout = PyArray_EMPTY(4, outdims, floatType, 1);
@@ -330,7 +363,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
330363
bool *xlostPtr = (bool *)PyArray_DATA((PyArrayObject *)xlost);
331364
AT_FLOAT *xlostcoordPtr = (AT_FLOAT *)PyArray_DATA((PyArrayObject *)xlostcoord);
332365

333-
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
366+
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
334367
xnturnPtr,xnelemPtr,xlostcoordPtr,true);
335368

336369
// Format result for AT
@@ -355,7 +388,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
355388

356389
} else {
357390

358-
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
391+
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
359392
nullptr,nullptr,nullptr,true);
360393
return rout;
361394

atgpu/main.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ void integratorTest(int gpu,string latticeName) {
134134
// Choose an arc close to 1mm where unexpected tune drift is observed when step size in too small (EBS lattice)
135135
AT_FLOAT *rin = createArc(0.001,M_PI/2.0,-M_PI/2.0,nbPart);
136136

137-
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, nullptr, nullptr, nullptr, nullptr, false);
137+
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, nullptr, nullptr, nullptr, false);
138138

139139
double err = 0;
140140
double max = 0;
@@ -209,9 +209,6 @@ void performanceTest(int gpu,string latticeName) {
209209
uint32_t nbPart = nbX * nbY;
210210
uint32_t refs[] = {l->getNbElement()};
211211
uint32_t nbRef = sizeof(refs) / sizeof(uint32_t);
212-
uint32_t starts[] = {0,100,200,300,400,500,600,700};
213-
uint32_t nbStride = sizeof(starts) / sizeof(uint32_t);
214-
//uint32_t nbStride = 0;
215212

216213
uint64_t routSize = nbTurn * nbPart * nbRef * 6 * sizeof(AT_FLOAT);
217214
AT_FLOAT *rout = (AT_FLOAT *) malloc(routSize);
@@ -222,7 +219,7 @@ void performanceTest(int gpu,string latticeName) {
222219
AT_FLOAT *lostAtCoord = new AT_FLOAT[nbPart * 6];
223220

224221
t0 = AbstractGPU::get_ticks();
225-
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, nbStride, starts, lostAtTurn, lostAtElem, lostAtCoord,false);
222+
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, lostAtTurn, lostAtElem, lostAtCoord,false);
226223
t1 = AbstractGPU::get_ticks();
227224

228225
AT_FLOAT *P = &rin[114*6];
@@ -231,9 +228,8 @@ void performanceTest(int gpu,string latticeName) {
231228
cout << "Lost elem:" << lostAtElem[114] << endl;
232229
cout << "Lost coord:" << lostAtCoord[114*6] << "," << lostAtCoord[114*6+2] << endl;
233230

234-
uint32_t strideSize = nbPart / nbStride;
235-
for(int stride=0; stride < nbStride; stride++) {
236-
P = ROUTPTR(stride*strideSize, 0, nbTurn - 1);
231+
for(int stride=0; stride < 8; stride++) {
232+
P = ROUTPTR(stride*8, 0, nbTurn - 1);
237233
cout << "[" << stride << "] " << P[0] << " " << P[1] << " " << P[2] << " " << P[3] << " " << P[4] << " " << P[5] << endl;
238234
}
239235

0 commit comments

Comments
 (0)