Skip to content

Commit 899b4b5

Browse files
committed
GS:SW: Use unaligned loads to reduce constant size on AVX2
Allows more instructions to use 1-byte offsets
1 parent f1c3cd8 commit 899b4b5

5 files changed

+100
-81
lines changed

pcsx2/GS/Renderers/SW/GSDrawScanline.cpp

+16-14
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
207207
constexpr int vlen = sizeof(VectorF) / sizeof(float);
208208

209209
#if _M_SSE >= 0x501
210-
const GSVector8* shift = (GSVector8*)g_const_256b.m_shift;
211-
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
210+
auto load_shift = [](int i) { return GSVector8::load<false>(&g_const_256b.m_shift[8 - i]); };
211+
const GSVector4 step_shift = GSVector4::broadcast32(&g_const_256b.m_shift[0]);
212212
#else
213-
const GSVector4* shift = (GSVector4*)g_const_128b.m_shift;
213+
static const GSVector4* shift = reinterpret_cast<const GSVector4*>(g_const_128b.m_shift);
214+
auto load_shift = [](int i) { return shift[1 + i]; };
214215
const GSVector4 step_shift = shift[0];
215216
#endif
216217

@@ -234,22 +235,23 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
234235

235236
for (int i = 0; i < vlen; i++)
236237
{
237-
local.d[i].f = VectorI(df * shift[1 + i]).xxzzlh();
238+
local.d[i].f = VectorI(df * load_shift(i)).xxzzlh();
238239
}
239240
}
240241

241242
if (has_z && !sel.zequal)
242243
{
243-
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
244244
const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
245245
#if _M_SSE >= 0x501
246-
GSVector4::storel(&local.d8.p.z, dz.mul64(GSVector4::f32to64(shift)));
246+
double dz = dscan.p.F64[1] * g_const_256b.m_shift[0];
247+
memcpy(&local.d8.p.z, &dz, sizeof(dz));
247248
#else
249+
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
248250
local.d4.z = dz.mul64(GSVector4::f32to64(shift));
249251
#endif
250252
for (int i = 0; i < vlen; i++)
251253
{
252-
local.d[i].z = dzf * shift[i + 1];
254+
local.d[i].z = dzf * load_shift(i);
253255
}
254256
}
255257
}
@@ -297,7 +299,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
297299

298300
for (int i = 0; i < vlen; i++)
299301
{
300-
VectorF v = dstq * shift[1 + i];
302+
VectorF v = dstq * load_shift(i);
301303

302304
if (sel.fst)
303305
{
@@ -336,8 +338,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
336338

337339
for (int i = 0; i < vlen; i++)
338340
{
339-
VectorI r = VectorI(dr * shift[1 + i]).ps32();
340-
VectorI b = VectorI(db * shift[1 + i]).ps32();
341+
VectorI r = VectorI(dr * load_shift(i)).ps32();
342+
VectorI b = VectorI(db * load_shift(i)).ps32();
341343

342344
local.d[i].rb = r.upl16(b);
343345
}
@@ -347,8 +349,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
347349

348350
for (int i = 0; i < vlen; i++)
349351
{
350-
VectorI g = VectorI(dg * shift[1 + i]).ps32();
351-
VectorI a = VectorI(da * shift[1 + i]).ps32();
352+
VectorI g = VectorI(dg * load_shift(i)).ps32();
353+
VectorI a = VectorI(da * load_shift(i)).ps32();
352354

353355
local.d[i].ga = g.upl16(a);
354356
}
@@ -515,7 +517,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
515517
steps = pixels + skip - vlen;
516518
left -= skip;
517519
#if _M_SSE >= 0x501
518-
test = GSVector8i::i8to32(g_const_256b.m_test[skip]) | GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
520+
test = GSVector8i::i8to32(&g_const_256b.m_test[16 - skip]) | GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
519521
#else
520522
test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
521523
#endif
@@ -1756,7 +1758,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
17561758
if (!sel.notest)
17571759
{
17581760
#if _M_SSE >= 0x501
1759-
test = GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
1761+
test = GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
17601762
#else
17611763
test = const_test[7 + (steps & (steps >> 31))];
17621764
#endif

pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp

+56-27
Original file line numberDiff line numberDiff line change
@@ -661,25 +661,29 @@ void GSDrawScanlineCodeGenerator::Init()
661661

662662
lea(a0.cvt32(), ptr[a0 + a1 - vecints]);
663663

664-
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
665-
666-
mov(eax, a0.cvt32());
667-
sar(eax, 31); // GH: 31 to extract the sign of the register
668-
and_(eax, a0.cvt32());
669-
if (isXmm)
670-
shl(eax, 4); // * sizeof(m_test[0])
671-
cdqe();
672-
673664
if (isXmm)
674665
{
666+
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
667+
mov(eax, a0.cvt32());
668+
sar(eax, 31); // GH: 31 to extract the sign of the register
669+
and_(eax, a0.cvt32());
670+
shl(eax, 4); // * sizeof(m_test[0])
671+
cdqe();
675672
shl(a1.cvt32(), 4); // * sizeof(m_test[0])
676673
movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]);
677674
por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
678675
}
679676
else
680677
{
681-
pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
682-
pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
678+
// GSVector8i test = loadu(&m_test[16 - skip]) | loadu(&m_test[steps >= 0 ? 0 : -steps]);
679+
mov(eax, a1.cvt32());
680+
neg(rax); // rax = -skip
681+
pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[16])]);
682+
xor_(t0.cvt32(), t0.cvt32());
683+
mov(eax, a0.cvt32());
684+
neg(eax); // eax = -steps
685+
cmovs(eax, t0.cvt32()); // if (eax < 0) eax = 0
686+
pmovsxbd(xym0, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
683687
por(_test, xym0);
684688
shl(a1.cvt32(), 5); // * sizeof(m_test[0])
685689
}
@@ -922,7 +926,7 @@ void GSDrawScanlineCodeGenerator::Init()
922926
/// Inputs: a0=steps, t0=fza_offset
923927
/// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test
924928
/// Destroys[x86]: all
925-
/// Destroys[x64]: xym0, xym1, xym2, xym3
929+
/// Destroys[x64]: xym0, xym1, xym2, xym3, t2
926930
void GSDrawScanlineCodeGenerator::Step()
927931
{
928932
// steps -= 4;
@@ -1048,19 +1052,22 @@ void GSDrawScanlineCodeGenerator::Step()
10481052

10491053
if (!m_sel.notest)
10501054
{
1055+
#if USING_XMM
10511056
// test = m_test[7 + (steps & (steps >> 31))];
10521057

10531058
mov(eax, a0.cvt32());
10541059
sar(eax, 31); // GH: 31 to extract the sign of the register
10551060
and_(eax, a0.cvt32());
1056-
if (isXmm)
1057-
shl(eax, 4);
1061+
shl(eax, 4);
10581062
cdqe();
1059-
1060-
#if USING_XMM
10611063
movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
10621064
#else
1063-
pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
1065+
// test = loadu(&m_test[steps >= 0 ? 0 : -steps]);
1066+
xor_(t2.cvt32(), t2.cvt32());
1067+
mov(eax, a0.cvt32());
1068+
neg(eax); // eax = -steps
1069+
cmovs(eax, t2.cvt32()); // if (eax < 0) eax = 0;
1070+
pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
10641071
#endif
10651072
}
10661073
}
@@ -1647,32 +1654,54 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
16471654
pslld(xym4, 9);
16481655
psrld(xym4, 9);
16491656

1650-
auto log2_coeff = [this](int i) -> Address
1657+
#if USING_YMM
1658+
auto load_log2_coeff = [this](const XYm& reg, int i)
16511659
{
1652-
if (isXmm)
1653-
return ptr[_m_const + offsetof(GSScanlineConstantData128B, m_log2_coef[i])];
1654-
else
1655-
return ptr[_m_const + offsetof(GSScanlineConstantData256B, m_log2_coef[i])];
1660+
vbroadcastss(reg, ptr[_m_const + offsetof(GSScanlineConstantData256B, m_log2_coef[i])]);
1661+
};
1662+
auto log2_coeff = [this, &load_log2_coeff](int i)
1663+
{
1664+
load_log2_coeff(xym6, i);
1665+
return xym6;
1666+
};
1667+
#else
1668+
auto log2_coeff = [this](int i) -> Operand
1669+
{
1670+
return ptr[_m_const + offsetof(GSScanlineConstantData128B, m_log2_coef[i])];
1671+
};
1672+
auto load_log2_coeff = [this, &log2_coeff](const XYm& reg, int i)
1673+
{
1674+
movaps(reg, log2_coeff(i));
16561675
};
1676+
#endif
16571677

1658-
orps(xym4, log2_coeff(3));
1678+
load_log2_coeff(xym1, 3);
1679+
orps(xym4, xym1);
16591680

16601681
// xym4 = mant(q) | 1.0f
16611682

16621683
if (hasFMA)
16631684
{
1664-
movaps(xym5, log2_coeff(0)); // c0
1685+
load_log2_coeff(xym5, 0); // c0
16651686
vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1
16661687
vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2
1667-
subps(xym4, log2_coeff(3)); // xym4 - 1.0f
1688+
subps(xym4, xym1); // xym4 - 1.0f
16681689
vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0
16691690
}
16701691
else
16711692
{
1672-
THREEARG(mulps, xym5, xym4, log2_coeff(0));
1693+
if (hasAVX)
1694+
{
1695+
vmulps(xym5, xym4, log2_coeff(0));
1696+
}
1697+
else
1698+
{
1699+
load_log2_coeff(xym5, 0);
1700+
mulps(xym5, xym4);
1701+
}
16731702
addps(xym5, log2_coeff(1));
16741703
mulps(xym5, xym4);
1675-
subps(xym4, log2_coeff(3));
1704+
subps(xym4, xym1);
16761705
addps(xym5, log2_coeff(2));
16771706
mulps(xym4, xym5);
16781707
addps(xym4, xym0);

pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h

+4
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,12 @@ class GSNewCodeGenerator
204204
FORWARD_OO_OI(or_)
205205
FORWARD_OO_OI(sub)
206206
FORWARD_OO_OI(xor_)
207+
FORWARD(2, BASE, cmovs, const Reg&, const Operand&)
207208
FORWARD(2, BASE, lea, const Reg&, const Address&)
208209
FORWARD(2, BASE, mov, const Operand&, size_t)
209210
FORWARD(2, BASE, mov, ARGS_OO)
210211
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
212+
FORWARD(1, BASE, neg, const Operand&)
211213
FORWARD(1, BASE, not_, const Operand&)
212214
FORWARD(1, BASE, pop, const Operand&)
213215
FORWARD(1, BASE, push, const Operand&)
@@ -243,6 +245,8 @@ class GSNewCodeGenerator
243245
AFORWARD(2, minps, ARGS_XO)
244246
SFORWARD(2, movaps, ARGS_XO)
245247
SFORWARD(2, movaps, const Address&, const Xmm&)
248+
SFORWARD(2, movups, ARGS_XO)
249+
SFORWARD(2, movups, const Address&, const Xmm&)
246250
SFORWARD(2, movd, const Address&, const Xmm&)
247251
SFORWARD(2, movd, const Reg32&, const Xmm&)
248252
SFORWARD(2, movd, const Xmm&, const Address&)

pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h

+11-32
Original file line numberDiff line numberDiff line change
@@ -256,46 +256,25 @@ namespace GSScanlineConstantData
256256
// Constant shared by all threads (to reduce cache miss)
257257
struct alignas(64) GSScanlineConstantData256B
258258
{
259-
alignas(32) u8 m_test[16][8] = {
260-
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
261-
{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
262-
{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
263-
{0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
264-
{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
265-
{0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
266-
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
267-
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
268-
{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
269-
{0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
270-
{0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
271-
{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
272-
{0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
273-
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
274-
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
275-
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
259+
// All AVX processors support unaligned access with little to no penalty as long as you don't cross a cache line.
260+
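// Take advantage of that: instead of one pre-shifted vector per index, store a single overlapping table per constant
// and select the wanted vector with an unaligned load that starts at a per-element (not per-vector) offset.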
261+
alignas(32) u8 m_test[24] = {
262+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
263+
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
264+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
276265
};
277-
alignas(32) float m_shift[9][8] = {
278-
{ 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f},
279-
{ 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f , 7.0f},
280-
{ -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f},
281-
{ -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f},
282-
{ -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f},
283-
{ -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f},
284-
{ -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f},
285-
{ -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f},
286-
{ -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f},
266+
float m_log2_coef[4] = {};
267+
alignas(64) float m_shift[16] = {
268+
8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
269+
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
287270
};
288-
alignas(32) float m_log2_coef[4][8] = {};
289271

290272
constexpr GSScanlineConstantData256B()
291273
{
292274
using namespace GSScanlineConstantData;
293275
for (size_t n = 0; n < std::size(log2_coef); ++n)
294276
{
295-
for (size_t i = 0; i < 8; ++i)
296-
{
297-
m_log2_coef[n][i] = log2_coef[n];
298-
}
277+
m_log2_coef[n] = log2_coef[n];
299278
}
300279
}
301280
};

pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp

+13-8
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ void GSSetupPrimCodeGenerator::Generate()
110110

111111
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
112112
{
113-
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
113+
if (isXmm)
114+
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
115+
else if (i == 0)
116+
vbroadcastss(xym3, ptr[rax]);
117+
else
118+
movups(XYm(3 + i), ptr[rax + (9 - i) * sizeof(float)]);
114119
}
115120
}
116121

@@ -253,7 +258,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
253258
if (i < 4 || many_regs)
254259
vmulps(ymm0, Ymm(4 + i), ymm1);
255260
else
256-
vmulps(ymm0, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
261+
vmulps(ymm0, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
257262
cvttps2dq(ymm0, ymm0);
258263
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
259264
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
@@ -281,7 +286,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
281286
if (i < 4 || many_regs)
282287
vmulps(ymm1, Ymm(4 + i), ymm0);
283288
else
284-
vmulps(ymm1, ymm0, ptr[g_const_256b.m_shift[i + 1]]);
289+
vmulps(ymm1, ymm0, ptr[&g_const_256b.m_shift[8 - i]]);
285290
movaps(_rip_local_di(i, z), ymm1);
286291
}
287292
}
@@ -356,7 +361,7 @@ void GSSetupPrimCodeGenerator::Texture()
356361
if (i < 4 || many_regs)
357362
THREEARG(mulps, xym2, XYm(4 + i), xym1);
358363
else
359-
vmulps(ymm2, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
364+
vmulps(ymm2, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
360365

361366
if (m_sel.fst)
362367
{
@@ -424,7 +429,7 @@ void GSSetupPrimCodeGenerator::Color()
424429
if (i < 4 || many_regs)
425430
THREEARG(mulps, xym0, XYm(4 + i), xym2);
426431
else
427-
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
432+
vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
428433
cvttps2dq(xym0, xym0);
429434
packssdw(xym0, xym0);
430435

@@ -433,7 +438,7 @@ void GSSetupPrimCodeGenerator::Color()
433438
if (i < 4 || many_regs)
434439
THREEARG(mulps, xym1, XYm(4 + i), xym3);
435440
else
436-
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
441+
vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
437442
cvttps2dq(xym1, xym1);
438443
packssdw(xym1, xym1);
439444

@@ -460,7 +465,7 @@ void GSSetupPrimCodeGenerator::Color()
460465
if (i < 4 || many_regs)
461466
THREEARG(mulps, xym0, XYm(4 + i), xym2);
462467
else
463-
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
468+
vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
464469
cvttps2dq(xym0, xym0);
465470
packssdw(xym0, xym0);
466471

@@ -469,7 +474,7 @@ void GSSetupPrimCodeGenerator::Color()
469474
if (i < 4 || many_regs)
470475
THREEARG(mulps, xym1, XYm(4 + i), xym3);
471476
else
472-
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
477+
vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
473478
cvttps2dq(xym1, xym1);
474479
packssdw(xym1, xym1);
475480

0 commit comments

Comments
 (0)