@@ -125,27 +125,31 @@ void CComputeShader::ParseShaderSpecificOpcode(llvm::Instruction* inst)
125
125
}
126
126
}
127
127
128
- void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& threadPayloadSize )
128
+ void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& curbeTotalDataLength, uint& curbeReadLength )
129
129
{
130
+ typedef uint16_t ThreadPayloadEntry;
131
+
130
132
// Find the max thread group dimension
131
133
const OctEltUnit SIZE_OF_DQWORD = OctEltUnit (2 );
134
+ const OctEltUnit SIZE_OF_OWORD = OctEltUnit (1 );
132
135
uint numberOfId = GetNumberOfId ();
133
136
uint dimX = numLanes (m_dispatchSize);
134
- uint dimY = ( iSTD::Align (m_threadGroupSize, dimX)/dimX) * numberOfId;
135
-
136
- typedef uint ThreadPayloadEntry;
137
-
138
- uint alignedVal = EltUnit (SIZE_OF_DQWORD). Count () * sizeof (DWORD); // Oct Element is 8 DWORDS
137
+ // dimX must align to alignment_X bytes (one GRF)
138
+ uint alignment_X = EltUnit (SIZE_OF_OWORD). Count () * sizeof (DWORD);
139
+ uint dimX_aligned = iSTD::Align (dimX * sizeof ( ThreadPayloadEntry), alignment_X) / sizeof (ThreadPayloadEntry) ;
140
+ uint dimY = ( iSTD::Align (m_threadGroupSize, dimX) / dimX) * numberOfId;
141
+ curbeReadLength = dimX_aligned * numberOfId * sizeof (ThreadPayloadEntry) / alignment_X;
139
142
143
+ uint alignedVal = EltUnit (SIZE_OF_DQWORD).Count () * sizeof (ThreadPayloadEntry); // Oct Element is 8 Entries
140
144
// m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
141
- threadPayloadSize = iSTD::Align ( dimX * dimY * sizeof ( ThreadPayloadEntry ) + m_NOSBufferSize, alignedVal );
145
+ curbeTotalDataLength = iSTD::Align (dimX_aligned * dimY * sizeof (ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
142
146
143
147
assert (pThreadPayload == nullptr && " Thread payload should be a null variable" );
144
148
145
- unsigned threadPayloadEntries = threadPayloadSize / sizeof (ThreadPayloadEntry);
149
+ unsigned threadPayloadEntries = curbeTotalDataLength / sizeof (ThreadPayloadEntry);
146
150
147
151
ThreadPayloadEntry* pThreadPayloadMem =
148
- (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries* sizeof (ThreadPayloadEntry), 16 );
152
+ (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries * sizeof (ThreadPayloadEntry), 16 );
149
153
std::fill (pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0 );
150
154
151
155
pThreadPayload = pThreadPayloadMem;
@@ -169,17 +173,17 @@ void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& threa
169
173
uint lane = 0 ;
170
174
if (m_pThread_ID_in_Group_X)
171
175
{
172
- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadX;
176
+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
173
177
lane++;
174
178
}
175
179
if (m_pThread_ID_in_Group_Y)
176
180
{
177
- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadY;
181
+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
178
182
lane++;
179
183
}
180
184
if (m_pThread_ID_in_Group_Z)
181
185
{
182
- pThreadPayloadMem[(y + lane) * dimX + x] = currThreadZ;
186
+ pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
183
187
lane++;
184
188
}
185
189
@@ -259,19 +263,19 @@ CVariable* CComputeShader::CreateThreadIDinGroup(uint channelNum)
259
263
case 0 :
260
264
if (m_pThread_ID_in_Group_X == nullptr )
261
265
{
262
- m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
266
+ m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
263
267
}
264
268
return m_pThread_ID_in_Group_X;
265
269
case 1 :
266
270
if (m_pThread_ID_in_Group_Y == nullptr )
267
271
{
268
- m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
272
+ m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
269
273
}
270
274
return m_pThread_ID_in_Group_Y;
271
275
case 2 :
272
276
if (m_pThread_ID_in_Group_Z == nullptr )
273
277
{
274
- m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
278
+ m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
275
279
}
276
280
return m_pThread_ID_in_Group_Z;
277
281
default :
@@ -335,6 +339,7 @@ void CComputeShader::AllocatePayload()
335
339
{
336
340
AllocateInput (m_pThread_ID_in_Group_X, offset, i);
337
341
offset += m_pThread_ID_in_Group_X->GetSize ();
342
+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign ()]);
338
343
}
339
344
}
340
345
@@ -344,6 +349,7 @@ void CComputeShader::AllocatePayload()
344
349
{
345
350
AllocateInput (m_pThread_ID_in_Group_Y, offset, i);
346
351
offset += m_pThread_ID_in_Group_Y->GetSize ();
352
+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign ()]);
347
353
}
348
354
}
349
355
@@ -353,6 +359,7 @@ void CComputeShader::AllocatePayload()
353
359
{
354
360
AllocateInput (m_pThread_ID_in_Group_Z, offset, i);
355
361
offset += m_pThread_ID_in_Group_Z->GetSize ();
362
+ offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign ()]);
356
363
}
357
364
}
358
365
@@ -466,8 +473,6 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
466
473
pKernelProgram->FloatingPointMode = USC::GFX3DSTATE_FLOATING_POINT_IEEE_754;
467
474
pKernelProgram->SingleProgramFlow = USC::GFX3DSTATE_PROGRAM_FLOW_MULTIPLE;
468
475
pKernelProgram->CurbeReadOffset = 0 ;
469
- pKernelProgram->CurbeReadLength = GetNumberOfId () * (numLanes (m_dispatchSize) / numLanes (SIMDMode::SIMD8));
470
-
471
476
pKernelProgram->PhysicalThreadsInGroup = static_cast <int >(
472
477
std::ceil ((static_cast <float >(m_threadGroupSize) /
473
478
static_cast <float >((numLanes (m_dispatchSize))))));
@@ -487,7 +492,8 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
487
492
pKernelProgram->ThreadPayloadData = nullptr ;
488
493
CreateThreadPayloadData (
489
494
pKernelProgram->ThreadPayloadData ,
490
- pKernelProgram->CurbeTotalDataLength );
495
+ pKernelProgram->CurbeTotalDataLength ,
496
+ pKernelProgram->CurbeReadLength );
491
497
492
498
pKernelProgram->ThreadGroupSize = m_threadGroupSize;
493
499
0 commit comments