atcollab · swhite2401 · Aug 21, 2025 · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025
diff --git a/atgpu/Lattice.cpp b/atgpu/Lattice.cpp
@@ -200,7 +200,6 @@ void Lattice::generateGPUKernel() {
   // GPU main track function
   code.append(kType + "void track(" + mType + "RING_PARAM *ringParam," + mType + "ELEMENT* gpuRing,\n"
               "                   uint32_t startElem,uint32_t nbElementToProcess,\n"
-              "                   " + mType + "int32_t *elemOffsets, uint32_t offsetStride,\n"
               "                   uint32_t nbPart," + mType + "AT_FLOAT* rin," + mType + "AT_FLOAT* rout,\n"
               "                   " + mType + "uint32_t* lost, uint32_t turn,\n"
               "                   " + mType + "int32_t *refpts, uint32_t nbRef,\n"
@@ -228,12 +227,9 @@ void Lattice::generateGPUKernel() {
   code.append("  sr6[4] = rin[4 + 6*threadId];\n"); // d = (pz-p0)/p0
   code.append("  sr6[5] = rin[5 + 6*threadId];\n"); // c.tau (time lag)
 
-  // The tracking starts at elem elemOffset (equivalent to lattice.rotate(elemOffset))
-  code.append("  int32_t elemOffset = elemOffsets?elemOffsets[threadId/offsetStride]:0;\n");
-
   // Loop over elements
   code.append("  while(!pLost && elem<startElem+nbElementToProcess) {\n");
-  code.append("    "+mType+" ELEMENT* elemPtr = gpuRing + ((elem+elemOffset)%NB_TOTAL_ELEMENT);\n");
+  code.append("    "+mType+" ELEMENT* elemPtr = gpuRing + elem;\n");
   storeParticleCoord(code);
   code.append("    switch(elemPtr->Type) {\n");
   factory.generatePassMethodsCalls(code);
@@ -307,7 +303,7 @@ uint64_t Lattice::fillGPUMemory() {
 }
 
 void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,
-                  uint32_t *refPts,uint32_t nbElemOffset,uint32_t *elemOffsets,
+                  uint32_t *refPts,uint32_t startElem,uint32_t endElem,
                   uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
                   bool updateRin) {
 
@@ -371,28 +367,14 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
   if(lostAtElem) gpu->allocDevice(&gpuLostAtElem, nbParticles * sizeof(uint32_t));
   if(lostAtCoord) gpu->allocDevice(&gpuLostAtCoord, nbParticles * 6 * sizeof(AT_FLOAT));
 
-  // Starting element (lattice rotation)
-  void *gpuOffsets = nullptr;
-  uint32_t offsetStride = 0;
-  if(nbElemOffset) {
-    gpu->allocDevice(&gpuOffsets, nbElemOffset * sizeof(int32_t));
-    gpu->hostToDevice(gpuOffsets, elemOffsets, nbElemOffset * sizeof(int32_t));
-    // nbParticles % nbElemOffset must be 0
-    // offsetStride (nbParticles/nbElemOffset) should be a multiple of 64
-    offsetStride = nbParticles / nbElemOffset;
-  }
-
   // Call GPU
   gpu->resetArg();
-  uint32_t startElem;
-  uint32_t nbElemToProcess = (uint32_t)elements.size();
+  uint32_t nbElemToProcess = endElem - startElem;
   uint32_t turn;
   gpu->addArg(sizeof(void *),&gpuRingParams);       // Global ring params
   gpu->addArg(sizeof(void *),&gpuRing);             // The lattice
   gpu->addArg(sizeof(uint32_t),&startElem);         // Process from
   gpu->addArg(sizeof(uint32_t),&nbElemToProcess);   // Number of element to process
-  gpu->addArg(sizeof(void *),&gpuOffsets);          // Tracking start offsets
-  gpu->addArg(sizeof(uint32_t),&offsetStride);      // Number of particle which starts at elem specified in gpuOffsets
   gpu->addArg(sizeof(uint32_t),&nbParticles);       // Total number of particle to track
   gpu->addArg(sizeof(void *),&gpuRin);              // Input 6D coordinates
   gpu->addArg(sizeof(void *),&gpuRout);             // Output 6D coordinates
@@ -405,8 +387,9 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
 
   // Turn loop
   for(turn=0;turn<nbTurn;turn++) {
-    startElem = 0;
     gpu->run(nbParticles); // 1 particle per thread
+    startElem = 0;
+    nbElemToProcess = (uint32_t)elements.size();
   }
 
   // Get back data
@@ -441,7 +424,6 @@ void Lattice::run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *r
   gpu->freeDevice(gpuRingParams);
   if( gpuLostAtElem ) gpu->freeDevice(gpuLostAtElem);
   if( gpuLostAtCoord ) gpu->freeDevice(gpuLostAtCoord);
-  if( gpuOffsets ) gpu->freeDevice(gpuOffsets);
   delete[] expandedRefPts;
   delete[] lost;
 

diff --git a/atgpu/Lattice.h b/atgpu/Lattice.h
@@ -27,7 +27,7 @@ class Lattice {
   void generateGPUKernel();
   // Run the simulation
   void run(uint64_t nbTurn,uint32_t nbParticles,AT_FLOAT *rin,AT_FLOAT *rout,uint32_t nbRef,uint32_t *refPts,
-           uint32_t nbElemOffset,uint32_t *elemOffsets,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
+           uint32_t startElem,uint32_t endElem,uint32_t *lostAtTurn,uint32_t *lostAtElem,AT_FLOAT *lostAtCoord,
            bool updateRin);
   // Return handle to the GPU context
   GPUContext *getGPUContext();

diff --git a/atgpu/PyInterface.cpp b/atgpu/PyInterface.cpp
@@ -41,18 +41,20 @@ static PyMethodDef AtGPUMethods[] = {
                           "    reuse:   if True, use previously cached description of the lattice.\n"
                           "    losses:  if True, process losses\n"
                           "    gpu_pool:  List of GPU id to use\n"
-                          "    tracking_starts: numpy array of indices of elements where tracking should start.\n"
-                          "       len(tracking_start) must divide the number of particle and it gives the stride size.\n"
-                          "       The i-th particle of rin starts at elem tracking_start[i/stride].\n"
-                          "       The behavior is similar to lattice.rotate(tracking_starts[i/stride]).\n"
-                          "       The stride size should be multiple of 64 for best performance.\n"
                           "    integrator: Type of integrator to use.\n"
                           "       1: Euler 1st order, 1 drift/1 kick per step.\n"
                           "       2: Mclachlan 2nd order, 2 drift/2 kicks per step.\n"
                           "       3: Ruth 3rd order, 3 drifts/3 kicks per step.\n"
                           "       4: Forest/Ruth 4th order, 4 drifts/3 kicks per step (Default).\n"
                           "       5: Optimal 4th order from R. Mclachlan, 4 drifts/4 kicks per step.\n"
-                          "       6: Yoshida 6th order, 8 drifts/7 kicks per step.\n\n"
+                          "       6: Yoshida 6th order, 8 drifts/7 kicks per step.\n"
+                          "    start_elem: Start tracking at the specified element. The\n"
+                          "       behavior is different from :py:func:`..lattice.rotate`, refpts are\n"
+                          "       still indexed from the begininig of the ring and s_coord starts at\n"
+                          "       the beginning of the ring. Default 0.\n"
+                          "    end_elem: End tracking at the specified element (exclusive).\n"
+                          "       When end_elem is different from the number of lattice elements,\n"
+                          "       nturns=1 is required. Default number of elements.\n\n"
                           "Returns:\n"
                           "    rout:    6 x n_particles x n_refpts x n_turns Fortran-ordered numpy array\n"
                           "         of particle coordinates\n\n"
@@ -138,7 +140,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
                                  "energy", "particle", "keep_counter",
                                  "reuse","losses",
                                  "bunch_spos", "bunch_currents", "gpu_pool",
-                                 "tracking_starts","integrator","verbose",
+                                 "integrator","verbose",
+                                 "start_elem","end_elem",
                                  nullptr};
 
   NPY_TYPES floatType;
@@ -168,10 +171,12 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
   int counter=0;
   int losses=0;
   int integratorType=4;
+  int startElem=0;
+  int endElem=-1;
   double t0,t1;
 
   // Get input args
-  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!O!ip", const_cast<char **>(kwlist),
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!O!i|O!$iO!O!pppO!O!O!ipii", const_cast<char **>(kwlist),
                                    &PyList_Type, &lattice,
                                    &PyArray_Type, &rin,
                                    &num_turns,
@@ -185,9 +190,10 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
                                    &PyArray_Type, &bspos,
                                    &PyArray_Type, &bcurrents,
                                    &PyList_Type, &gpupool,
-                                   &PyArray_Type, &trackstarts,
                                    &integratorType,
-                                   &verbose
+                                   &verbose,
+                                   &startElem,
+                                   &endElem
                                    )) {
     return nullptr;
   }
@@ -292,6 +298,32 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
 
   }
 
+  int num_elements = gpuLattice->getNbElement();
+
+  // Check for partial turn tracking
+  if(endElem==-1) endElem = num_elements;
+  if(num_elements!=0) {
+    if(startElem<0 || startElem>=num_elements) {
+      char errMsg[64];
+      sprintf(errMsg,"start_elem out of range, %d not in [0..%d]",startElem,num_elements-1);
+      return PyErr_Format(PyExc_ValueError, errMsg);
+    }
+    if(endElem<0 || endElem>num_elements) {
+      char errMsg[64];
+      sprintf(errMsg,"end_elem out of range, %d not in [0..%d]",endElem,num_elements);
+      return PyErr_Format(PyExc_ValueError, errMsg);
+    }
+    if(startElem>endElem) {
+      return PyErr_Format(PyExc_ValueError, "end_elem must be greater that start_elem");
+    }
+    if((endElem!=num_elements) && (num_turns!=1)) {
+      return PyErr_Format(PyExc_ValueError, "partial turn tracking not ending at the end of the ring requires nturns=1");
+    }
+  } else {
+    startElem = 0;
+    endElem = 0;
+  }
+
   // Load lattice on the GPU
   try {
     uint64_t size = gpuLattice->fillGPUMemory();
@@ -309,7 +341,8 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
   try {
 
     if( verbose )
-      cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " << gpuLattice->getGPUContext()->name() << " #" << gpuId << endl;
+      cout << "Tracking " << num_particles << " particles for " << num_turns << " turns on " <<
+      gpuLattice->getGPUContext()->name() << " #" << gpuId << " Integrator: #" << integratorType << endl;
 
     npy_intp outdims[4] = {6,(npy_intp)(num_particles),num_refs,num_turns};
     PyObject *rout = PyArray_EMPTY(4, outdims, floatType, 1);
@@ -330,7 +363,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
       bool *xlostPtr = (bool *)PyArray_DATA((PyArrayObject *)xlost);
       AT_FLOAT *xlostcoordPtr = (AT_FLOAT *)PyArray_DATA((PyArrayObject *)xlostcoord);
 
-      gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
+      gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
                       xnturnPtr,xnelemPtr,xlostcoordPtr,true);
 
       // Format result for AT
@@ -355,7 +388,7 @@ static PyObject *at_gpupass(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     } else {
 
-      gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,num_starts,track_starts,
+      gpuLattice->run(num_turns,num_particles,drin,drout,num_refs,ref_pts,startElem,endElem,
                       nullptr,nullptr,nullptr,true);
       return rout;
 

diff --git a/atgpu/main.cpp b/atgpu/main.cpp
@@ -134,7 +134,7 @@ void integratorTest(int gpu,string latticeName) {
       // Choose an arc close to 1mm where unexpected tune drift is observed when step size in too small (EBS lattice)
       AT_FLOAT *rin = createArc(0.001,M_PI/2.0,-M_PI/2.0,nbPart);
 
-      l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, nullptr, nullptr, nullptr, nullptr, false);
+      l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, nullptr, nullptr, nullptr, false);
 
       double err = 0;
       double max = 0;
@@ -209,9 +209,6 @@ void performanceTest(int gpu,string latticeName) {
       uint32_t nbPart = nbX * nbY;
       uint32_t refs[] = {l->getNbElement()};
       uint32_t nbRef = sizeof(refs) / sizeof(uint32_t);
-      uint32_t starts[] = {0,100,200,300,400,500,600,700};
-      uint32_t nbStride = sizeof(starts) / sizeof(uint32_t);
-      //uint32_t nbStride = 0;
 
       uint64_t routSize = nbTurn * nbPart * nbRef * 6 * sizeof(AT_FLOAT);
       AT_FLOAT *rout = (AT_FLOAT *) malloc(routSize);
@@ -222,7 +219,7 @@ void performanceTest(int gpu,string latticeName) {
       AT_FLOAT *lostAtCoord = new AT_FLOAT[nbPart * 6];
 
       t0 = AbstractGPU::get_ticks();
-      l->run(nbTurn, nbPart, rin, rout, nbRef, refs, nbStride, starts, lostAtTurn, lostAtElem, lostAtCoord,false);
+      l->run(nbTurn, nbPart, rin, rout, nbRef, refs, 0, l->getNbElement()-1, lostAtTurn, lostAtElem, lostAtCoord,false);
       t1 = AbstractGPU::get_ticks();
 
       AT_FLOAT *P = &rin[114*6];
@@ -231,9 +228,8 @@ void performanceTest(int gpu,string latticeName) {
       cout << "Lost elem:" << lostAtElem[114] << endl;
       cout << "Lost coord:" << lostAtCoord[114*6] << "," << lostAtCoord[114*6+2] << endl;
 
-      uint32_t strideSize = nbPart / nbStride;
-      for(int stride=0; stride < nbStride; stride++) {
-        P = ROUTPTR(stride*strideSize, 0, nbTurn - 1);
+      for(int stride=0; stride < 8; stride++) {
+        P = ROUTPTR(stride*8, 0, nbTurn - 1);
         cout << "[" << stride << "] " << P[0] << " " << P[1] << " " << P[2] << " " << P[3] << " " << P[4] << " " << P[5] << endl;
       }