diff --git a/atgpu/Lattice.cpp b/atgpu/Lattice.cpp index 3e63fc815..d01baf7fe 100755 --- a/atgpu/Lattice.cpp +++ b/atgpu/Lattice.cpp @@ -200,7 +200,6 @@ void Lattice::generateGPUKernel() { // GPU main track function code.append(kType + "void track(" + mType + "RING_PARAM *ringParam," + mType + "ELEMENT* gpuRing,\n" " uint32_t startElem,uint32_t nbElementToProcess,\n" - " " + mType + "int32_t *elemOffsets, uint32_t offsetStride,\n" " uint32_t nbPart," + mType + "AT_FLOAT* rin," + mType + "AT_FLOAT* rout,\n" " " + mType + "uint32_t* lost, uint32_t turn,\n" " " + mType + "int32_t *refpts, uint32_t nbRef,\n" @@ -228,12 +227,9 @@ void Lattice::generateGPUKernel() { code.append(" sr6[4] = rin[4 + 6*threadId];\n"); // d = (pz-p0)/p0 code.append(" sr6[5] = rin[5 + 6*threadId];\n"); // c.tau (time lag) - // The tracking starts at elem elemOffset (equivalent to lattice.rotate(elemOffset)) - code.append(" int32_t elemOffset = elemOffsets?elemOffsets[threadId/offsetStride]:0;\n"); - // Loop over elements code.append(" while(!pLost && elemType) {\n"); factory.generatePassMethodsCalls(code); @@ -307,7 +303,7 @@ uint64_t Lattice::fillGPUMemory() { } void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef, - uint32_t *refPts,uint32_t nbElemOffset,uint32_t *elemOffsets, + uint32_t *refPts,uint32_t startElem,uint32_t endElem, uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord, bool updateRin) { @@ -371,28 +367,14 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r if(lostAtElem) gpu->allocDevice(&gpuLostAtElem, nbParticles * sizeof(uint32_t)); if(lostAtCoord) gpu->allocDevice(&gpuLostAtCoord, nbParticles * 6 * sizeof(AT_FLOAT)); - // Starting element (lattice rotation) - void *gpuOffsets = nullptr; - uint32_t offsetStride = 0; - if(nbElemOffset) { - gpu->allocDevice(&gpuOffsets, nbElemOffset * sizeof(int32_t)); - gpu->hostToDevice(gpuOffsets, elemOffsets, nbElemOffset * sizeof(int32_t)); - // nbParticles % nbElemOffset must be 0 - // offsetStride (nbParticles/nbElemOffset) should be a multiple of 64 - offsetStride = nbParticles / nbElemOffset; - } - // Call GPU gpu->resetArg(); - uint32_t startElem; - uint32_t nbElemToProcess = (uint32_t)elements.size(); + uint32_t nbElemToProcess = endElem - startElem; uint32_t turn; gpu->addArg(sizeof(void *),&gpuRingParams); // Global ring params gpu->addArg(sizeof(void *),&gpuRing); // The lattice gpu->addArg(sizeof(uint32_t),&startElem); // Process from gpu->addArg(sizeof(uint32_t),&nbElemToProcess); // Number of element to process - gpu->addArg(sizeof(void *),&gpuOffsets); // Tracking start offsets - gpu->addArg(sizeof(uint32_t),&offsetStride); // Number of particle which starts at elem specified in gpuOffsets gpu->addArg(sizeof(uint32_t),&nbParticles); // Total number of particle to track gpu->addArg(sizeof(void *),&gpuRin); // Input 6D coordinates gpu->addArg(sizeof(void *),&gpuRout); // Output 6D coordinates @@ -405,8 +387,9 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r // Turn loop for(turn=0;turnrun(nbParticles); // 1 particle per thread + startElem = 0; + nbElemToProcess = (uint32_t)elements.size(); } // Get back data @@ -441,7 +424,6 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r gpu->freeDevice(gpuRingParams); if( gpuLostAtElem ) gpu->freeDevice(gpuLostAtElem); if( gpuLostAtCoord ) gpu->freeDevice(gpuLostAtCoord); - if( gpuOffsets ) gpu->freeDevice(gpuOffsets); delete[] expandedRefPts; delete[] lost; diff --git a/atgpu/Lattice.h b/atgpu/Lattice.h index e3e2b5bed..e5785854c 100644 --- a/atgpu/Lattice.h +++ b/atgpu/Lattice.h @@ -27,7 +27,7 @@ class Lattice { void generateGPUKernel(); // Run the simulation void run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,uint32_t *refPts, - uint32_t nbElemOffset,uint32_t *elemOffsets,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord, + uint32_t startElem,uint32_t endElem,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord, bool updateRin); // Return handle to the GPU context GPUContext *getGPUContext(); diff --git a/atgpu/PyInterface.cpp b/atgpu/PyInterface.cpp index 539b78b40..1bb62e1e6 100755 --- a/atgpu/PyInterface.cpp +++ b/atgpu/PyInterface.cpp @@ -41,18 +41,20 @@ static PyMethodDef AtGPUMethods[] = { " reuse: if True, use previously cached description of the lattice.\n" " losses: if True, process losses\n" " gpu_pool: List of GPU id to use\n" - " tracking_starts: numpy array of indices of elements where tracking should start.\n" - " len(tracking_start) must divide the number of particle and it gives the stride size.\n" - " The i-th particle of rin starts at elem tracking_start[i/stride].\n" - " The behavior is similar to lattice.rotate(tracking_starts[i/stride]).\n" - " The stride size should be multiple of 64 for best performance.\n" " integrator: Type of integrator to use.\n" " 1: Euler 1st order, 1 drift/1 kick per step.\n" " 2: Mclachlan 2nd order, 2 drift/2 kicks per step.\n" " 3: Ruth 3rd order, 3 drifts/3 kicks per step.\n" " 4: Forest/Ruth 4th order, 4 drifts/3 kicks per step (Default).\n" " 5: Optimal 4th order from R. Mclachlan, 4 drifts/4 kicks per step.\n" - " 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n\n" + " 6: Yoshida 6th order, 8 drifts/7 kicks per step.\n" + " start_elem: Start tracking at the specified element. The\n" + " behavior is different from :py:func:`..lattice.rotate`, refpts are\n" + " still indexed from the begininig of the ring and s_coord starts at\n" + " the beginning of the ring. Default 0.\n" + " end_elem: End tracking at the specified element (exclusive).\n" + " When end_elem is different from the number of lattice elements,\n" + " nturns=1 is required. Default number of elements.\n\n" "Returns:\n" " rout: 6 x n_particles x n_refpts x n_turns Fortran-ordered numpy array\n" " of particle coordinates\n\n" @@ -138,7 +140,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { "energy", "particle", "keep_counter", "reuse","losses", "bunch_spos", "bunch_currents", "gpu_pool", - "tracking_starts","integrator","verbose", + "integrator","verbose", + "start_elem","end_elem", nullptr}; NPY_TYPES floatType; @@ -168,10 +171,12 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { int counter=0; int losses=0; int integratorType=4; + int startElem=0; + int endElem=-1; double t0,t1; // Get input args - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!O!ip", const_cast(kwlist), + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!ipii", const_cast(kwlist), &PyList_Type, &lattice, &PyArray_Type, &rin, &num_turns, @@ -185,9 +190,10 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { &PyArray_Type, &bspos, &PyArray_Type, &bcurrents, &PyList_Type, &gpupool, - &PyArray_Type, &trackstarts, &integratorType, - &verbose + &verbose, + &startElem, + &endElem )) { return nullptr; } @@ -292,6 +298,32 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { } + int num_elements = gpuLattice->getNbElement(); + + // Check for partial turn tracking + if(endElem==-1) endElem = num_elements; + if(num_elements!=0) { + if(startElem<0 || startElem>=num_elements) { + char errMsg[64]; + sprintf(errMsg,"start_elem out of range, %d not in [0..%d]",startElem,num_elements-1); + return PyErr_Format(PyExc_ValueError, errMsg); + } + if(endElem<0 || endElem>num_elements) { + char errMsg[64]; + sprintf(errMsg,"end_elem out of range, %d not in [0..%d]",endElem,num_elements); + return PyErr_Format(PyExc_ValueError, errMsg); + } + if(startElem>endElem) { + return PyErr_Format(PyExc_ValueError, "end_elem must be greater that start_elem"); + } + if((endElem!=num_elements) && (num_turns!=1)) { + return PyErr_Format(PyExc_ValueError, "partial turn tracking not ending at the end of the ring requires nturns=1"); + } + } else { + startElem = 0; + endElem = 0; + } + // Load lattice on the GPU try { uint64_t size = gpuLattice->fillGPUMemory(); @@ -309,7 +341,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { try { if( verbose ) - cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " << gpuLattice->getGPUContext()->name() << " #" << gpuId << endl; + cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " << + gpuLattice->getGPUContext()->name() << " #" << gpuId << " Integrator: #" << integratorType << endl; npy_intp outdims[4] = {6,(npy_intp)(num_particles),num_refs,num_turns}; PyObject *rout = PyArray_EMPTY(4, outdims, floatType, 1); @@ -330,7 +363,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { bool *xlostPtr = (bool *)PyArray_DATA((PyArrayObject *)xlost); AT_FLOAT *xlostcoordPtr = (AT_FLOAT *)PyArray_DATA((PyArrayObject *)xlostcoord); - gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts, + gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem, xnturnPtr,xnelemPtr,xlostcoordPtr,true); // Format result for AT @@ -355,7 +388,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) { } else { - gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts, + gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem, nullptr,nullptr,nullptr,true); return rout; diff --git a/atgpu/main.cpp b/atgpu/main.cpp index 1e3a85e2b..0630df41f 100755 --- a/atgpu/main.cpp +++ b/atgpu/main.cpp @@ -134,7 +134,7 @@ void integratorTest(int gpu,string latticeName) { // Choose an arc close to 1mm where unexpected tune drift is observed when step size in too small (EBS lattice) AT_FLOAT *rin = createArc(0.001,M_PI/2.0,-M_PI/2.0,nbPart); - l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, nullptr, nullptr, nullptr, nullptr, false); + l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, nullptr, nullptr, nullptr, false); double err = 0; double max = 0; @@ -209,9 +209,6 @@ void performanceTest(int gpu,string latticeName) { uint32_t nbPart = nbX * nbY; uint32_t refs[] = {l->getNbElement()}; uint32_t nbRef = sizeof(refs) / sizeof(uint32_t); - uint32_t starts[] = {0,100,200,300,400,500,600,700}; - uint32_t nbStride = sizeof(starts) / sizeof(uint32_t); - //uint32_t nbStride = 0; uint64_t routSize = nbTurn * nbPart * nbRef * 6 * sizeof(AT_FLOAT); AT_FLOAT *rout = (AT_FLOAT *) malloc(routSize); @@ -222,7 +219,7 @@ void performanceTest(int gpu,string latticeName) { AT_FLOAT *lostAtCoord = new AT_FLOAT[nbPart * 6]; t0 = AbstractGPU::get_ticks(); - l->run(nbTurn, nbPart, rin, rout, nbRef, refs, nbStride, starts, lostAtTurn, lostAtElem, lostAtCoord,false); + l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, lostAtTurn, lostAtElem, lostAtCoord,false); t1 = AbstractGPU::get_ticks(); AT_FLOAT *P = &rin[114*6]; @@ -231,9 +228,8 @@ void performanceTest(int gpu,string latticeName) { cout << "Lost elem:" << lostAtElem[114] << endl; cout << "Lost coord:" << lostAtCoord[114*6] << "," << lostAtCoord[114*6+2] << endl; - uint32_t strideSize = nbPart / nbStride; - for(int stride=0; stride < nbStride; stride++) { - P = ROUTPTR(stride*strideSize, 0, nbTurn - 1); + for(int stride=0; stride < 8; stride++) { + P = ROUTPTR(stride*8, 0, nbTurn - 1); cout << "[" << stride << "] " << P[0] << " " << P[1] << " " << P[2] << " " << P[3] << " " << P[4] << " " << P[5] << endl; } diff --git a/pyat/at.c b/pyat/at.c index 8a7d673a2..778790acf 100644 --- a/pyat/at.c +++ b/pyat/at.c @@ -332,12 +332,14 @@ void set_current_fillpattern(PyArrayObject *bspos, PyArrayObject *bcurrents, * - nturns: int number of turns to simulate * - refpts: numpy uint32 array denoting elements at which to return state * - reuse: whether to reuse the cached state of the ring + * - start_elem: start tracking at element #start_elem + * - end_elem: end tracking at element #end_elem (exclusive) */ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { static char *kwlist[] = {"line","rin","nturns","refpts","turn", "energy", "particle", "keep_counter", "reuse","omp_num_threads","losses", - "bunch_spos", "bunch_currents", NULL}; + "bunch_spos", "bunch_currents", "start_elem", "end_elem", NULL}; static double lattice_length = 0.0; static int last_turn = 0; static int valid = 0; @@ -363,6 +365,8 @@ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { npy_uint32 omp_num_threads=0; npy_uint32 num_particles, np6; npy_uint32 elem_index; + npy_uint32 start_elem = 0; + npy_uint32 end_elem = 0; npy_uint32 *refpts = NULL; npy_uint32 nextref; unsigned int nextrefindex; @@ -387,12 +391,13 @@ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { bspos=NULL; bcurrents=NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!ppIpO!O!", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!ppIpO!O!ii", kwlist, &PyList_Type, &lattice, &PyArray_Type, &rin, &num_turns, &PyArray_Type, &refs, &counter, &PyFloat_Type ,&energy, particle_type, &particle, &keep_counter, &keep_lattice, &omp_num_threads, &losses, - &PyArray_Type, &bspos, &PyArray_Type, &bcurrents)) { + &PyArray_Type, &bspos, &PyArray_Type, &bcurrents, + &start_elem,&end_elem)) { return NULL; } if (PyArray_DIM(rin,0) != 6) { @@ -411,61 +416,14 @@ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { param.rest_energy=0.0; param.charge=-1.0; param.num_turns=num_turns; - param.bdiff = NULL; - + param.bdiff = NULL; if (keep_counter) param.nturn = last_turn; else param.nturn = counter; - set_energy_particle(lattice, energy, particle, ¶m); set_current_fillpattern(bspos, bcurrents, ¶m); - num_particles = (PyArray_SIZE(rin)/6); - np6 = num_particles*6; - drin = PyArray_DATA(rin); - - if (refs) { - if (PyArray_TYPE(refs) != NPY_UINT32) { - return PyErr_Format(PyExc_ValueError, "refpts is not a uint32 array"); - } - refpts = PyArray_DATA(refs); - num_refpts = PyArray_SIZE(refs); - } - else { - refpts = NULL; - num_refpts = 0; - } - outdims[0] = 6; - outdims[1] = num_particles; - outdims[2] = num_refpts; - outdims[3] = num_turns; - rout = PyArray_EMPTY(4, outdims, NPY_DOUBLE, 1); - drout = PyArray_DATA((PyArrayObject *)rout); - - if(losses){ - pdims[0]= num_particles; - lxdims[0]= 6; - lxdims[1]= num_particles; - xnturn = PyArray_EMPTY(1, pdims, NPY_UINT32, 1); - xnelem = PyArray_EMPTY(1, pdims, NPY_UINT32, 1); - xlost = PyArray_EMPTY(1, pdims, NPY_BOOL, 1); - xlostcoord = PyArray_EMPTY(2, lxdims, NPY_DOUBLE, 1); - ixnturn = PyArray_DATA((PyArrayObject *)xnturn); - ixnelem = PyArray_DATA((PyArrayObject *)xnelem); - bxlost = PyArray_DATA((PyArrayObject *)xlost); - dxlostcoord = PyArray_DATA((PyArrayObject *)xlostcoord); - unsigned int i; - static double r0[6]; - for(i=0;i 0) && (num_particles > OMP_PARTICLE_THRESHOLD)) { unsigned int nthreads = omp_get_num_procs(); @@ -521,10 +479,10 @@ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *el = PyList_GET_ITEM(lattice, elem_index); PyObject *PyPassMethod = PyObject_GetAttrString(el, "PassMethod"); double length; - if (!PyPassMethod) return print_error(elem_index, rout); /* No PassMethod: AttributeError */ + if (!PyPassMethod) return NULL; /* No PassMethod: AttributeError */ LibraryListPtr = get_track_function(PyUnicode_AsUTF8(PyPassMethod)); Py_DECREF(PyPassMethod); - if (!LibraryListPtr) return print_error(elem_index, rout); /* No trackFunction for the given PassMethod: RuntimeError */ + if (!LibraryListPtr) return NULL; /* No trackFunction for the given PassMethod: RuntimeError */ pylength = PyObject_GetAttrString(el, "Length"); if (pylength) { length = PyFloat_AsDouble(pylength); @@ -544,18 +502,88 @@ static PyObject *at_atpass(PyObject *self, PyObject *args, PyObject *kwargs) { valid = 0; } + /* Check for partial turn tracking */ + if(end_elem==0) end_elem = num_elements; + if(num_elements!=0) { + if(start_elem<0 || start_elem>=num_elements) { + char errMsg[64]; + sprintf(errMsg,"start_elem out of range, %d not in [0..%d]",start_elem,num_elements-1); + return PyErr_Format(PyExc_ValueError, errMsg); + } + if(end_elem<0 || end_elem>num_elements) { + char errMsg[64]; + sprintf(errMsg,"end_elem out of range, %d not in [0..%d]",start_elem,num_elements); + return PyErr_Format(PyExc_ValueError, errMsg); + } + if(start_elem>end_elem) { + return PyErr_Format(PyExc_ValueError, "end_elem must be greater that start_elem"); + } + if((end_elem!=num_elements) && (num_turns!=1)) { + return PyErr_Format(PyExc_ValueError, "partial turn tracking not ending at the end of the ring requires nturns=1"); + } + } else { + start_elem = 0; + end_elem = 0; + } + + /* Particle energy */ param.RingLength = lattice_length; if (param.rest_energy == 0.0) { param.T0 = param.RingLength/C0; - } - else { + } else { double gamma0 = param.energy/param.rest_energy; double betagamma0 = sqrt(gamma0*gamma0 - 1.0); double beta0 = betagamma0/gamma0; param.T0 = param.RingLength/beta0/C0; } + set_energy_particle(lattice, energy, particle, ¶m); + + /* Memory allocation for outputs */ + num_particles = (PyArray_SIZE(rin)/6); + np6 = num_particles*6; + drin = PyArray_DATA(rin); + + if (refs) { + if (PyArray_TYPE(refs) != NPY_UINT32) { + return PyErr_Format(PyExc_ValueError, "refpts is not a uint32 array"); + } + refpts = PyArray_DATA(refs); + num_refpts = PyArray_SIZE(refs); + } else { + refpts = NULL; + num_refpts = 0; + } + outdims[0] = 6; + outdims[1] = num_particles; + outdims[2] = num_refpts; + outdims[3] = num_turns; + rout = PyArray_EMPTY(4, outdims, NPY_DOUBLE, 1); + drout = PyArray_DATA((PyArrayObject *)rout); + + if(losses){ + pdims[0]= num_particles; + lxdims[0]= 6; + lxdims[1]= num_particles; + xnturn = PyArray_EMPTY(1, pdims, NPY_UINT32, 1); + xnelem = PyArray_EMPTY(1, pdims, NPY_UINT32, 1); + xlost = PyArray_EMPTY(1, pdims, NPY_BOOL, 1); + xlostcoord = PyArray_EMPTY(2, lxdims, NPY_DOUBLE, 1); + ixnturn = PyArray_DATA((PyArrayObject *)xnturn); + ixnelem = PyArray_DATA((PyArrayObject *)xnelem); + bxlost = PyArray_DATA((PyArrayObject *)xlost); + dxlostcoord = PyArray_DATA((PyArrayObject *)xlostcoord); + unsigned int i; + static double r0[6]; + for(i=0;i