Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 5 additions & 23 deletions atgpu/Lattice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ void Lattice::generateGPUKernel() {
// GPU main track function
code.append(kType + "void track(" + mType + "RING_PARAM *ringParam," + mType + "ELEMENT* gpuRing,\n"
" uint32_t startElem,uint32_t nbElementToProcess,\n"
" " + mType + "int32_t *elemOffsets, uint32_t offsetStride,\n"
" uint32_t nbPart," + mType + "AT_FLOAT* rin," + mType + "AT_FLOAT* rout,\n"
" " + mType + "uint32_t* lost, uint32_t turn,\n"
" " + mType + "int32_t *refpts, uint32_t nbRef,\n"
Expand Down Expand Up @@ -228,12 +227,9 @@ void Lattice::generateGPUKernel() {
code.append(" sr6[4] = rin[4 + 6*threadId];\n"); // d = (pz-p0)/p0
code.append(" sr6[5] = rin[5 + 6*threadId];\n"); // c.tau (time lag)

// The tracking starts at elem elemOffset (equivalent to lattice.rotate(elemOffset))
code.append(" int32_t elemOffset = elemOffsets?elemOffsets[threadId/offsetStride]:0;\n");

// Loop over elements
code.append(" while(!pLost && elem<startElem+nbElementToProcess) {\n");
code.append(" "+mType+" ELEMENT* elemPtr = gpuRing + ((elem+elemOffset)%NB_TOTAL_ELEMENT);\n");
code.append(" "+mType+" ELEMENT* elemPtr = gpuRing + elem;\n");
storeParticleCoord(code);
code.append(" switch(elemPtr->Type) {\n");
factory.generatePassMethodsCalls(code);
Expand Down Expand Up @@ -307,7 +303,7 @@ uint64_t Lattice::fillGPUMemory() {
}

void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,
uint32_t *refPts,uint32_t nbElemOffset,uint32_t *elemOffsets,
uint32_t *refPts,uint32_t startElem,uint32_t endElem,
uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
bool updateRin) {

Expand Down Expand Up @@ -371,28 +367,14 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
if(lostAtElem) gpu->allocDevice(&gpuLostAtElem, nbParticles * sizeof(uint32_t));
if(lostAtCoord) gpu->allocDevice(&gpuLostAtCoord, nbParticles * 6 * sizeof(AT_FLOAT));

// Starting element (lattice rotation)
void *gpuOffsets = nullptr;
uint32_t offsetStride = 0;
if(nbElemOffset) {
gpu->allocDevice(&gpuOffsets, nbElemOffset * sizeof(int32_t));
gpu->hostToDevice(gpuOffsets, elemOffsets, nbElemOffset * sizeof(int32_t));
// nbParticles % nbElemOffset must be 0
// offsetStride (nbParticles/nbElemOffset) should be a multiple of 64
offsetStride = nbParticles / nbElemOffset;
}

// Call GPU
gpu->resetArg();
uint32_t startElem;
uint32_t nbElemToProcess = (uint32_t)elements.size();
uint32_t nbElemToProcess = endElem - startElem;
uint32_t turn;
gpu->addArg(sizeof(void *),&gpuRingParams); // Global ring params
gpu->addArg(sizeof(void *),&gpuRing); // The lattice
gpu->addArg(sizeof(uint32_t),&startElem); // Process from
gpu->addArg(sizeof(uint32_t),&nbElemToProcess); // Number of element to process
gpu->addArg(sizeof(void *),&gpuOffsets); // Tracking start offsets
gpu->addArg(sizeof(uint32_t),&offsetStride); // Number of particle which starts at elem specified in gpuOffsets
gpu->addArg(sizeof(uint32_t),&nbParticles); // Total number of particle to track
gpu->addArg(sizeof(void *),&gpuRin); // Input 6D coordinates
gpu->addArg(sizeof(void *),&gpuRout); // Output 6D coordinates
Expand All @@ -405,8 +387,9 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r

// Turn loop
for(turn=0;turn<nbTurn;turn++) {
startElem = 0;
gpu->run(nbParticles); // 1 particle per thread
startElem = 0;
nbElemToProcess = (uint32_t)elements.size();
}

// Get back data
Expand Down Expand Up @@ -441,7 +424,6 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
gpu->freeDevice(gpuRingParams);
if( gpuLostAtElem ) gpu->freeDevice(gpuLostAtElem);
if( gpuLostAtCoord ) gpu->freeDevice(gpuLostAtCoord);
if( gpuOffsets ) gpu->freeDevice(gpuOffsets);
delete[] expandedRefPts;
delete[] lost;

Expand Down
2 changes: 1 addition & 1 deletion atgpu/Lattice.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class Lattice {
void generateGPUKernel();
// Run the simulation
void run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,uint32_t *refPts,
uint32_t nbElemOffset,uint32_t *elemOffsets,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
uint32_t startElem,uint32_t endElem,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
bool updateRin);
// Return handle to the GPU context
GPUContext *getGPUContext();
Expand Down
59 changes: 46 additions & 13 deletions atgpu/PyInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,20 @@ static PyMethodDef AtGPUMethods[] = {
" reuse: if True, use previously cached description of the lattice.\n"
" losses: if True, process losses\n"
" gpu_pool: List of GPU id to use\n"
" tracking_starts: numpy array of indices of elements where tracking should start.\n"
" len(tracking_start) must divide the number of particle and it gives the stride size.\n"
" The i-th particle of rin starts at elem tracking_start[i/stride].\n"
" The behavior is similar to lattice.rotate(tracking_starts[i/stride]).\n"
" The stride size should be multiple of 64 for best performance.\n"
" integrator: Type of integrator to use.\n"
" 1: Euler 1st order, 1 drift/1 kick per step.\n"
" 2: Mclachlan 2nd order, 2 drift/2 kicks per step.\n"
" 3: Ruth 3rd order, 3 drifts/3 kicks per step.\n"
" 4: Forest/Ruth 4th order, 4 drifts/3 kicks per step (Default).\n"
" 5: Optimal 4th order from R. Mclachlan, 4 drifts/4 kicks per step.\n"
" 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n\n"
" 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n"
" start_elem: Start tracking at the specified element. The\n"
" behavior is different from :py:func:`..lattice.rotate`, refpts are\n"
" still indexed from the begininig of the ring and s_coord starts at\n"
" the beginning of the ring. Default 0.\n"
" end_elem: End tracking at the specified element (exclusive).\n"
" When end_elem is different from the number of lattice elements,\n"
" nturns=1 is required. Default number of elements.\n\n"
"Returns:\n"
" rout: 6 x n_particles x n_refpts x n_turns Fortran-ordered numpy array\n"
" of particle coordinates\n\n"
Expand Down Expand Up @@ -138,7 +140,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
"energy", "particle", "keep_counter",
"reuse","losses",
"bunch_spos", "bunch_currents", "gpu_pool",
"tracking_starts","integrator","verbose",
"integrator","verbose",
"start_elem","end_elem",
nullptr};

NPY_TYPES floatType;
Expand Down Expand Up @@ -168,10 +171,12 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
int counter=0;
int losses=0;
int integratorType=4;
int startElem=0;
int endElem=-1;
double t0,t1;

// Get input args
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!O!ip", const_cast<char **>(kwlist),
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!ipii", const_cast<char **>(kwlist),
&PyList_Type, &lattice,
&PyArray_Type, &rin,
&num_turns,
Expand All @@ -185,9 +190,10 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
&PyArray_Type, &bspos,
&PyArray_Type, &bcurrents,
&PyList_Type, &gpupool,
&PyArray_Type, &trackstarts,
&integratorType,
&verbose
&verbose,
&startElem,
&endElem
)) {
return nullptr;
}
Expand Down Expand Up @@ -292,6 +298,32 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {

}

int num_elements = gpuLattice->getNbElement();

// Check for partial turn tracking
if(endElem==-1) endElem = num_elements;
if(num_elements!=0) {
if(startElem<0 || startElem>=num_elements) {
char errMsg[64];
sprintf(errMsg,"start_elem out of range, %d not in [0..%d]",startElem,num_elements-1);
return PyErr_Format(PyExc_ValueError, errMsg);
}
if(endElem<0 || endElem>num_elements) {
char errMsg[64];
sprintf(errMsg,"end_elem out of range, %d not in [0..%d]",endElem,num_elements);
return PyErr_Format(PyExc_ValueError, errMsg);
}
if(startElem>endElem) {
return PyErr_Format(PyExc_ValueError, "end_elem must be greater that start_elem");
}
if((endElem!=num_elements) && (num_turns!=1)) {
return PyErr_Format(PyExc_ValueError, "partial turn tracking not ending at the end of the ring requires nturns=1");
}
} else {
startElem = 0;
endElem = 0;
}

// Load lattice on the GPU
try {
uint64_t size = gpuLattice->fillGPUMemory();
Expand All @@ -309,7 +341,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
try {

if( verbose )
cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " << gpuLattice->getGPUContext()->name() << " #" << gpuId << endl;
cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " <<
gpuLattice->getGPUContext()->name() << " #" << gpuId << " Integrator: #" << integratorType << endl;

npy_intp outdims[4] = {6,(npy_intp)(num_particles),num_refs,num_turns};
PyObject *rout = PyArray_EMPTY(4, outdims, floatType, 1);
Expand All @@ -330,7 +363,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
bool *xlostPtr = (bool *)PyArray_DATA((PyArrayObject *)xlost);
AT_FLOAT *xlostcoordPtr = (AT_FLOAT *)PyArray_DATA((PyArrayObject *)xlostcoord);

gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
xnturnPtr,xnelemPtr,xlostcoordPtr,true);

// Format result for AT
Expand All @@ -355,7 +388,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {

} else {

gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
nullptr,nullptr,nullptr,true);
return rout;

Expand Down
12 changes: 4 additions & 8 deletions atgpu/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ void integratorTest(int gpu,string latticeName) {
// Choose an arc close to 1mm where unexpected tune drift is observed when step size in too small (EBS lattice)
AT_FLOAT *rin = createArc(0.001,M_PI/2.0,-M_PI/2.0,nbPart);

l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, nullptr, nullptr, nullptr, nullptr, false);
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, nullptr, nullptr, nullptr, false);

double err = 0;
double max = 0;
Expand Down Expand Up @@ -209,9 +209,6 @@ void performanceTest(int gpu,string latticeName) {
uint32_t nbPart = nbX * nbY;
uint32_t refs[] = {l->getNbElement()};
uint32_t nbRef = sizeof(refs) / sizeof(uint32_t);
uint32_t starts[] = {0,100,200,300,400,500,600,700};
uint32_t nbStride = sizeof(starts) / sizeof(uint32_t);
//uint32_t nbStride = 0;

uint64_t routSize = nbTurn * nbPart * nbRef * 6 * sizeof(AT_FLOAT);
AT_FLOAT *rout = (AT_FLOAT *) malloc(routSize);
Expand All @@ -222,7 +219,7 @@ void performanceTest(int gpu,string latticeName) {
AT_FLOAT *lostAtCoord = new AT_FLOAT[nbPart * 6];

t0 = AbstractGPU::get_ticks();
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, nbStride, starts, lostAtTurn, lostAtElem, lostAtCoord,false);
l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, lostAtTurn, lostAtElem, lostAtCoord,false);
t1 = AbstractGPU::get_ticks();

AT_FLOAT *P = &rin[114*6];
Expand All @@ -231,9 +228,8 @@ void performanceTest(int gpu,string latticeName) {
cout << "Lost elem:" << lostAtElem[114] << endl;
cout << "Lost coord:" << lostAtCoord[114*6] << "," << lostAtCoord[114*6+2] << endl;

uint32_t strideSize = nbPart / nbStride;
for(int stride=0; stride < nbStride; stride++) {
P = ROUTPTR(stride*strideSize, 0, nbTurn - 1);
for(int stride=0; stride < 8; stride++) {
P = ROUTPTR(stride*8, 0, nbTurn - 1);
cout << "[" << stride << "] " << P[0] << " " << P[1] << " " << P[2] << " " << P[3] << " " << P[4] << " " << P[5] << endl;
}

Expand Down
Loading
Loading