diff --git a/CMakeLists.txt b/CMakeLists.txt index ecb05d4..7e4069a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,6 @@ add_library(parallel-rsp STATIC arch/simd/rsp/vcmp.h arch/simd/rsp/vdivh.h arch/simd/rsp/vmac.h - arch/simd/rsp/vmov.h arch/simd/rsp/vmrg.h arch/simd/rsp/vmudh.h arch/simd/rsp/vmul.h diff --git a/CREDITS.txt b/CREDITS.txt index 49f4064..d5f5eb0 100644 --- a/CREDITS.txt +++ b/CREDITS.txt @@ -1,11 +1,11 @@ Written by Themaister. -The code is heavily reliant on MarathonMan's CEN64 RSP implementation, as well as CXD4's RSP implementation. +The code is heavily reliant on MarathonMan's CEN64 RSP implementation, as well as Ares and CXD4's RSP implementations. MIPS core: Rewritten from scratch -CP0: Near copy-pasta from CEN64 +CP0: Near copy-pasta from CEN64, with some fixes from Ares brought in CP2: Near copy-pasta from CEN64 -LS pipe: Near copy-pasta from CXD4 +LS pipe: Ported from Ares Mupen64plus glue code: Reused most of CXD4. Lightning jitter interface: Written from scratch diff --git a/arch/simd/rsp/rsp_impl.h b/arch/simd/rsp/rsp_impl.h index f81dfa6..e12bce2 100644 --- a/arch/simd/rsp/rsp_impl.h +++ b/arch/simd/rsp/rsp_impl.h @@ -12,7 +12,6 @@ #include "vcr.h" #include "vdivh.h" #include "vmac.h" -#include "vmov.h" #include "vmrg.h" #include "vmul.h" #include "vmulh.h" diff --git a/arch/simd/rsp/vmov.h b/arch/simd/rsp/vmov.h deleted file mode 100644 index be5856e..0000000 --- a/arch/simd/rsp/vmov.h +++ /dev/null @@ -1,18 +0,0 @@ -// -// arch/x86_64/rsp/vmov.c -// -// This file is subject to the terms and conditions defined in -// 'LICENSE', which is part of this source code package. -// - -inline __m128i rsp_vmov(RSP::CPUState *rsp, unsigned src, unsigned e, unsigned dest, unsigned de) -{ - uint16_t data; - - // Get the element from VT. - data = rsp->cp2.regs[src].e[e & 0x7]; - - // Write out the upper part of the result. - rsp->cp2.regs[dest].e[de & 0x7] = data; - return rsp_vect_load_unshuffled_operand(rsp->cp2.regs[dest].e); -} diff --git a/parallel.cpp b/parallel.cpp index 62fa53e..c7ce7a0 100644 --- a/parallel.cpp +++ b/parallel.cpp @@ -52,7 +52,7 @@ extern "C" EXPORT unsigned int CALL parallelRSPDoRspCycles(unsigned int cycles) { - if (*RSP::rsp.SP_STATUS_REG & (SP_STATUS_HALT | SP_STATUS_BROKE)) + if (*RSP::rsp.SP_STATUS_REG & SP_STATUS_HALT) return 0; // We don't know if Mupen from the outside invalidated our IMEM. @@ -83,6 +83,8 @@ extern "C" return cycles; else if (*RSP::cpu.get_state().cp0.irq & 1) RSP::rsp.CheckInterrupts(); + else if (*RSP::rsp.SP_STATUS_REG & SP_STATUS_HALT) + return cycles; else if (*RSP::rsp.SP_SEMAPHORE_REG != 0) // Semaphore lock fixes. { } diff --git a/rsp/cp0.cpp b/rsp/cp0.cpp index 22d0ca5..e7bcafa 100644 --- a/rsp/cp0.cpp +++ b/rsp/cp0.cpp @@ -27,14 +27,10 @@ extern "C" if (rt) rsp->sr[rt] = res; - // CFG_MEND_SEMAPHORE_LOCK == 0 by default, - // so don't bother implementing semaphores. - // It makes Mario Golf run terribly for some reason. - #ifdef PARALLEL_INTEGRATION - // WAIT_FOR_CPU_HOST. From CXD4. if (rd == CP0_REGISTER_SP_STATUS) { + // Might be waiting for the CPU to set a signal bit on the STATUS register. Increment timeout RSP::MFC0_count[rt] += 1; if (RSP::MFC0_count[rt] >= RSP::SP_STATUS_TIMEOUT) { @@ -44,81 +40,65 @@ extern "C" } #endif + if (rd == CP0_REGISTER_SP_SEMAPHORE) + { + if (*rsp->cp0.cr[CP0_REGISTER_SP_SEMAPHORE]) + { +#ifdef PARALLEL_INTEGRATION + RSP::MFC0_count[rt] += 8; // Almost certainly waiting on the CPU. Timeout faster. + if (RSP::MFC0_count[rt] >= RSP::SP_STATUS_TIMEOUT) + { + *RSP::rsp.SP_STATUS_REG |= SP_STATUS_HALT; + return MODE_CHECK_FLAGS; + } +#endif + } + else + *rsp->cp0.cr[CP0_REGISTER_SP_SEMAPHORE] = 1; + } + //if (rd == 4) // SP_STATUS_REG // fprintf(stderr, "READING STATUS REG!\n"); return MODE_CONTINUE; } +#define RSP_HANDLE_STATUS_WRITE(flag) \ + switch (rt & (SP_SET_##flag | SP_CLR_##flag)) \ + { \ + case SP_SET_##flag: status |= SP_STATUS_##flag; break; \ + case SP_CLR_##flag: status &= ~SP_STATUS_##flag; break; \ + default: break; \ + } + static inline int rsp_status_write(RSP::CPUState *rsp, uint32_t rt) { //fprintf(stderr, "Writing 0x%x to status reg!\n", rt); uint32_t status = *rsp->cp0.cr[CP0_REGISTER_SP_STATUS]; - if (rt & SP_CLR_HALT) - status &= ~SP_STATUS_HALT; - else if (rt & SP_SET_HALT) - status |= SP_STATUS_HALT; + RSP_HANDLE_STATUS_WRITE(HALT) + RSP_HANDLE_STATUS_WRITE(SSTEP) + RSP_HANDLE_STATUS_WRITE(INTR_BREAK) + RSP_HANDLE_STATUS_WRITE(SIG0) + RSP_HANDLE_STATUS_WRITE(SIG1) + RSP_HANDLE_STATUS_WRITE(SIG2) + RSP_HANDLE_STATUS_WRITE(SIG3) + RSP_HANDLE_STATUS_WRITE(SIG4) + RSP_HANDLE_STATUS_WRITE(SIG5) + RSP_HANDLE_STATUS_WRITE(SIG6) + RSP_HANDLE_STATUS_WRITE(SIG7) + + switch (rt & (SP_SET_INTR | SP_CLR_INTR)) + { + case SP_SET_INTR: *rsp->cp0.irq |= 1; break; + case SP_CLR_INTR: *rsp->cp0.irq &= ~1; break; + default: break; + } if (rt & SP_CLR_BROKE) status &= ~SP_STATUS_BROKE; - if (rt & SP_CLR_INTR) - *rsp->cp0.irq &= ~1; - else if (rt & SP_SET_INTR) - *rsp->cp0.irq |= 1; - - if (rt & SP_CLR_SSTEP) - status &= ~SP_STATUS_SSTEP; - else if (rt & SP_SET_SSTEP) - status |= SP_STATUS_SSTEP; - - if (rt & SP_CLR_INTR_BREAK) - status &= ~SP_STATUS_INTR_BREAK; - else if (rt & SP_SET_INTR_BREAK) - status |= SP_STATUS_INTR_BREAK; - - if (rt & SP_CLR_SIG0) - status &= ~SP_STATUS_SIG0; - else if (rt & SP_SET_SIG0) - status |= SP_STATUS_SIG0; - - if (rt & SP_CLR_SIG1) - status &= ~SP_STATUS_SIG1; - else if (rt & SP_SET_SIG1) - status |= SP_STATUS_SIG1; - - if (rt & SP_CLR_SIG2) - status &= ~SP_STATUS_SIG2; - else if (rt & SP_SET_SIG2) - status |= SP_STATUS_SIG2; - - if (rt & SP_CLR_SIG3) - status &= ~SP_STATUS_SIG3; - else if (rt & SP_SET_SIG3) - status |= SP_STATUS_SIG3; - - if (rt & SP_CLR_SIG4) - status &= ~SP_STATUS_SIG4; - else if (rt & SP_SET_SIG4) - status |= SP_STATUS_SIG4; - - if (rt & SP_CLR_SIG5) - status &= ~SP_STATUS_SIG5; - else if (rt & SP_SET_SIG5) - status |= SP_STATUS_SIG5; - - if (rt & SP_CLR_SIG6) - status &= ~SP_STATUS_SIG6; - else if (rt & SP_SET_SIG6) - status |= SP_STATUS_SIG6; - - if (rt & SP_CLR_SIG7) - status &= ~SP_STATUS_SIG7; - else if (rt & SP_SET_SIG7) - status |= SP_STATUS_SIG7; - *rsp->cp0.cr[CP0_REGISTER_SP_STATUS] = status; return ((*rsp->cp0.irq & 1) || (status & SP_STATUS_HALT)) ? MODE_CHECK_FLAGS : MODE_CONTINUE; } @@ -178,6 +158,7 @@ extern "C" *rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] = source; *rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] = dest; + *rsp->cp0.cr[CP0_REGISTER_DMA_READ_LENGTH] = 0xff8; #ifdef INTENSE_DEBUG log_rsp_mem_parallel(); @@ -231,6 +212,7 @@ extern "C" *rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] = source; *rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] = dest; + *rsp->cp0.cr[CP0_REGISTER_DMA_WRITE_LENGTH] = 0xff8; #ifdef INTENSE_DEBUG log_rsp_mem_parallel(); #endif @@ -269,9 +251,9 @@ extern "C" case CP0_REGISTER_SP_STATUS: return rsp_status_write(rsp, val); - case CP0_REGISTER_SP_RESERVED: - // CXD4 forces this to 0. - *rsp->cp0.cr[CP0_REGISTER_SP_RESERVED] = 0; + case CP0_REGISTER_SP_SEMAPHORE: + // Any write to the semaphore register, regardless of value, sets it to 0 for the next read + *rsp->cp0.cr[CP0_REGISTER_SP_SEMAPHORE] = 0; break; case CP0_REGISTER_CMD_START: diff --git a/rsp/cp2.cpp b/rsp/cp2.cpp index 2d2780a..0c70250 100644 --- a/rsp/cp2.cpp +++ b/rsp/cp2.cpp @@ -29,23 +29,21 @@ extern "C" void RSP_MTC2(RSP::CPUState *rsp, unsigned rt, unsigned rd, unsigned element) { - uint16_t *e = rsp->cp2.regs[rd].e; - #ifdef INTENSE_DEBUG fprintf(stderr, "MTC2, rt = %u, [rt] = 0x%x, rd = %u, e = %u\n", rt, rsp->sr[rt], rd, element); #endif - unsigned lo = element >> 1; - rt = rsp->sr[rt]; - + uint16_t *e = rsp->cp2.regs[rd].e; + const uint16_t v = rsp->sr[rt]; if (element & 1) { - unsigned hi = (element + 1) >> 1; - e[lo] = (e[lo] & 0xff00) | ((rt >> 8) & 0xff); - e[hi] = (e[lo] & 0x00ff) | ((rt & 0xff) << 8); + const auto i = element >> 1; + e[i] = (e[i] & 0xff00) | (v >> 8); + if (element != 0xf) + e[i+1] = (e[i+1] & 0xff) | (v << 8); } else - e[lo] = rt; + e[element >> 1] = v; } void RSP_MFC2(RSP::CPUState *rsp, unsigned rt, unsigned rd, unsigned element) diff --git a/rsp/ls.cpp b/rsp/ls.cpp index e1acb21..0544dc3 100644 --- a/rsp/ls.cpp +++ b/rsp/ls.cpp @@ -9,10 +9,24 @@ extern "C" { - // Using mostly CXD4 implementation as a base here since it's easier to follow. - // CEN64's implementation seems much better, but takes more effort to port for now. - // Reading wide words together with SSE4 blend, SSSE3 pshufb, etc should make this much faster. + // Using mostly Ares' implementation as a base here + static inline uint8_t byteFromHalfWords(const uint16_t *arr, unsigned i) + { + return (i & 1) ? + (uint8_t)(arr[i >> 1] & 0xff) : + (uint8_t)(arr[i >> 1] >> 8); + } + + static inline void writeByteToHalfWords(uint16_t *arr, unsigned i, uint8_t b) + { + const unsigned n = i >> 1; + if (i & 1) + arr[n] = (arr[n] & 0xff00) | (uint16_t)b; + else + arr[n] = (arr[n] & 0xff) | ((uint16_t)b << 8); + } + // Load 8-bit void RSP_LBV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { @@ -39,21 +53,10 @@ extern "C" void RSP_LSV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LSV); - if (e & 1) - return; - - unsigned addr = (rsp->sr[base] + offset * 2) & 0xfff; - unsigned correction = addr & 3; - if (correction == 3) - return; - - uint16_t result; - if (correction == 1) - result = (READ_MEM_U8(rsp->dmem, addr + 0) << 8) | (READ_MEM_U8(rsp->dmem, addr + 1) << 0); - else - result = READ_MEM_U16(rsp->dmem, addr); - - rsp->cp2.regs[rt].e[e >> 1] = result; + unsigned addr = rsp->sr[base] + offset * 2; + const unsigned end = (e > 14) ? 16 : (e + 2); + for (unsigned i = e; i < end; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); } // Store 16-bit @@ -76,63 +79,34 @@ extern "C" void RSP_LLV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LLV); - unsigned addr = (rsp->sr[base] + offset * 4) & 0xfff; - if (e & 1) - return; - if (addr & 1) - return; - e >>= 1; - - rsp->cp2.regs[rt].e[e] = READ_MEM_U16(rsp->dmem, addr); - rsp->cp2.regs[rt].e[(e + 1) & 7] = READ_MEM_U16(rsp->dmem, (addr + 2) & 0xfff); + unsigned addr = rsp->sr[base] + offset * 4; + const unsigned end = (e > 12) ? 16 : (e + 4); + for (unsigned i = e; i < end; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); } // Store 32-bit void RSP_SLV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SLV); - if ((e & 1) || (e > 0xc)) - return; unsigned addr = (rsp->sr[base] + offset * 4) & 0xfff; #ifdef INTENSE_DEBUG fprintf(stderr, "SLV 0x%x, e = %u\n", addr, e); #endif - if (addr & 1) - return; - e >>= 1; - - uint16_t v0 = rsp->cp2.regs[rt].e[e]; - uint16_t v1 = rsp->cp2.regs[rt].e[e + 1]; - WRITE_MEM_U16(rsp->dmem, addr, v0); - WRITE_MEM_U16(rsp->dmem, (addr + 2) & 0xfff, v1); + for (unsigned i = e; i < e + 4; i++) + WRITE_MEM_U8(rsp->dmem, addr++, byteFromHalfWords(rsp->cp2.regs[rt].e, i & 0xf)); } // Load 64-bit void RSP_LDV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LDV); - if (e & 1) - return; - unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff; - auto *reg = rsp->cp2.regs[rt].e; - e >>= 1; - - if (addr & 1) - { - reg[e + 0] = (READ_MEM_U8(rsp->dmem, addr + 0) << 8) | READ_MEM_U8(rsp->dmem, addr + 1); - reg[e + 1] = (READ_MEM_U8(rsp->dmem, addr + 2) << 8) | READ_MEM_U8(rsp->dmem, addr + 3); - reg[e + 2] = (READ_MEM_U8(rsp->dmem, addr + 4) << 8) | READ_MEM_U8(rsp->dmem, addr + 5); - reg[e + 3] = (READ_MEM_U8(rsp->dmem, addr + 6) << 8) | READ_MEM_U8(rsp->dmem, addr + 7); - } - else - { - reg[e + 0] = READ_MEM_U16(rsp->dmem, addr); - reg[e + 1] = READ_MEM_U16(rsp->dmem, (addr + 2) & 0xfff); - reg[e + 2] = READ_MEM_U16(rsp->dmem, (addr + 4) & 0xfff); - reg[e + 3] = READ_MEM_U16(rsp->dmem, (addr + 6) & 0xfff); - } + unsigned addr = rsp->sr[base] + offset * 8; + const unsigned end = (e > 8) ? 16 : (e + 8); + for (unsigned i = e; i < end; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); } // Store 64-bit @@ -168,24 +142,25 @@ extern "C" void RSP_LPV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LPV); - if (e != 0) - return; - unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff; + const unsigned index = (addr & 7) - e; + addr &= ~7; + auto *reg = rsp->cp2.regs[rt].e; for (unsigned i = 0; i < 8; i++) - reg[i] = READ_MEM_U8(rsp->dmem, (addr + i) & 0xfff) << 8; + reg[i] = READ_MEM_U8(rsp->dmem, (addr + (i + index & 0xf)) & 0xfff) << 8; } void RSP_SPV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SPV); - if (e != 0) - return; unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff; auto *reg = rsp->cp2.regs[rt].e; - for (unsigned i = 0; i < 8; i++) - WRITE_MEM_U8(rsp->dmem, (addr + i) & 0xfff, int16_t(reg[i]) >> 8); + + for (unsigned i = e; i < e + 8; i++) { + const unsigned shift = ((i & 0xf) < 8) ? 8 : 7; + WRITE_MEM_U8(rsp->dmem, addr++ & 0xfff, int16_t(reg[i & 7]) >> shift); + } } // Load 8x8-bit into high bits, but shift by 7 instead of 8. @@ -195,36 +170,24 @@ extern "C" { TRACE_LS(LUV); unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff; - auto *reg = rsp->cp2.regs[rt].e; + const unsigned index = (addr & 7) - e; + addr &= ~7; - if (e != 0) - { - // Special path for Mia Hamm soccer. - addr += -e & 0xf; - for (unsigned b = 0; b < 8; b++) - { - reg[b] = READ_MEM_U8(rsp->dmem, addr) << 7; - --e; - addr -= e ? 0 : 16; - ++addr; - } - } - else - { - for (unsigned i = 0; i < 8; i++) - reg[i] = READ_MEM_U8(rsp->dmem, (addr + i) & 0xfff) << 7; - } + auto *reg = rsp->cp2.regs[rt].e; + for (unsigned i = 0; i < 8; i++) + reg[i] = READ_MEM_U8(rsp->dmem, (addr + (i + index & 0xf)) & 0xfff) << 7; } void RSP_SUV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SUV); - if (e != 0) - return; unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff; auto *reg = rsp->cp2.regs[rt].e; - for (unsigned i = 0; i < 8; i++) - WRITE_MEM_U8(rsp->dmem, (addr + i) & 0xfff, int16_t(reg[i]) >> 7); + + for (unsigned i = e; i < e + 8; i++) { + const unsigned shift = ((i & 0xf) < 8) ? 7 : 8; + WRITE_MEM_U8(rsp->dmem, addr++ & 0xfff, int16_t(reg[i & 7]) >> shift); + } } // Load 8x8-bits into high bits, but shift by 7 instead of 8. @@ -232,171 +195,199 @@ extern "C" void RSP_LHV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LHV); - if (e != 0) - return; - unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 0xe) - return; + unsigned addr = rsp->sr[base] + offset * 16; + const unsigned index = (addr & 7) - e; + addr &= ~7; auto *reg = rsp->cp2.regs[rt].e; for (unsigned i = 0; i < 8; i++) - reg[i] = READ_MEM_U8(rsp->dmem, addr + 2 * i) << 7; + reg[i] = (uint16_t)READ_MEM_U8(rsp->dmem, (addr + (index + i * 2 & 0xf)) & 0xfff) << 7; } void RSP_SHV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SHV); - if (e != 0) - return; unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - auto *reg = rsp->cp2.regs[rt].e; + const unsigned index = addr & 7; + addr &= ~7; + + const auto *reg = rsp->cp2.regs[rt].e; for (unsigned i = 0; i < 8; i++) - WRITE_MEM_U8(rsp->dmem, (addr + 2 * i) & 0xfff, int16_t(reg[i]) >> 7); + { + const unsigned b = e + (i << 1); + const uint8_t byte = byteFromHalfWords(reg, b & 0xf) << 1 | byteFromHalfWords(reg, b + 1 & 0xf) >> 7; + WRITE_MEM_U8(rsp->dmem, addr + (index + i * 2 & 0xf), byte); + } + } + + void RSP_LFV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) + { + TRACE_LS(LFV); + uint16_t temp[8]; + + unsigned addr = rsp->sr[base] + offset * 16; + const unsigned index = (addr & 7) - e; + const unsigned end = (e > 8) ? 16 : (e + 8); + addr &= ~7; + + for (unsigned i = 0; i < 4; i++) + { + temp[i] = (uint16_t)READ_MEM_U8(rsp->dmem, (addr + (index + i * 4 & 0xf)) & 0xfff) << 7; + temp[i+4] = (uint16_t)READ_MEM_U8(rsp->dmem, (addr + (index + i * 4 + 8 & 0xf)) & 0xfff) << 7; + } + + for (unsigned i = e; i < end; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i, byteFromHalfWords(temp, i)); } - // No idea what the purpose of this is. +#define RSP_SFV_CASE(a,b,c,d) \ + WRITE_MEM_U8(rsp->dmem, addr + base, int16_t(reg[a]) >> 7); \ + WRITE_MEM_U8(rsp->dmem, addr + 4 + base, int16_t(reg[b]) >> 7); \ + WRITE_MEM_U8(rsp->dmem, addr + (8 + base & 0xf), int16_t(reg[c]) >> 7); \ + WRITE_MEM_U8(rsp->dmem, addr + (12 + base & 0xf), int16_t(reg[d]) >> 7); + void RSP_SFV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SFV); - unsigned addr = (rsp->sr[base] + offset * 16) & 0xff3; + unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; + base = addr & 7; + addr &= ~7; + auto *reg = rsp->cp2.regs[rt].e; switch (e) { case 0: - WRITE_MEM_U8(rsp->dmem, (addr + 0) & 0xfff, int16_t(reg[0]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 4) & 0xfff, int16_t(reg[1]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 8) & 0xfff, int16_t(reg[2]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 12) & 0xfff, int16_t(reg[3]) >> 7); + case 15: + RSP_SFV_CASE(0,1,2,3) + break; + case 1: + RSP_SFV_CASE(6,7,4,5) + break; + case 4: + RSP_SFV_CASE(1,2,3,0) + break; + case 5: + RSP_SFV_CASE(7,4,5,6) break; - case 8: - WRITE_MEM_U8(rsp->dmem, (addr + 0) & 0xfff, int16_t(reg[4]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 4) & 0xfff, int16_t(reg[5]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 8) & 0xfff, int16_t(reg[6]) >> 7); - WRITE_MEM_U8(rsp->dmem, (addr + 12) & 0xfff, int16_t(reg[7]) >> 7); + RSP_SFV_CASE(4,5,6,7) + break; + case 11: + RSP_SFV_CASE(3,0,1,2) + break; + case 12: + RSP_SFV_CASE(5,6,7,4) break; - default: + WRITE_MEM_U8(rsp->dmem, addr + base, 0); + WRITE_MEM_U8(rsp->dmem, addr + 4 + base, 0); + WRITE_MEM_U8(rsp->dmem, addr + (8 + base & 0xf), 0); + WRITE_MEM_U8(rsp->dmem, addr + (12 + base & 0xf), 0); break; } } - // Loads full 128-bit register, however, it seems to handle unaligned addresses in a very - // strange way. - void RSP_LQV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) + void RSP_LWV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { - TRACE_LS(LQV); - if (e & 1) - return; - unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; + TRACE_LS(LWV); + unsigned addr = rsp->sr[base] + offset * 16; + for (unsigned i = 16 - e; i < 16 + e; i++) + { + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr & 0xfff)); + addr += 4; + } + } -#ifdef INTENSE_DEBUG - fprintf(stderr, "LQV: 0x%x, e = %u, vt = %u, base = %u\n", addr, e, rt, base); -#endif + void RSP_SWV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) + { + TRACE_LS(SWV); - if (addr & 1) - return; + unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; + base = addr & 7; + addr &= ~7; - unsigned b = (addr & 0xf) >> 1; - e >>= 1; + for (unsigned i = e; i < e + 16; i++) + WRITE_MEM_U8(rsp->dmem, addr + (base++ & 0xf), byteFromHalfWords(rsp->cp2.regs[rt].e, i & 0xf)); + } - auto *reg = rsp->cp2.regs[rt].e; - for (unsigned i = b; i < 8; i++, e++, addr += 2) - reg[e] = READ_MEM_U16(rsp->dmem, addr & 0xfff); + void RSP_LQV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) + { + TRACE_LS(LQV); + unsigned addr = rsp->sr[base] + offset * 16; + unsigned end = 16 + e - (addr & 0xf); + if (end > 16) end = 16; + + for (unsigned i = e; i < end; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); } void RSP_SQV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SQV); unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 1) - return; - - unsigned b = addr & 0xf; - - auto *reg = rsp->cp2.regs[rt].e; - - if (e != 0) - { - // Mia Hamm Soccer - for (unsigned i = 0; i < 16 - b; i++, addr++) - { - WRITE_MEM_U8(rsp->dmem, addr & 0xfff, reinterpret_cast(reg)[MES((e + i) & 0xf)]); - } - } - else - { - b >>= 1; - for (unsigned i = b; i < 8; i++, e++, addr += 2) - WRITE_MEM_U16(rsp->dmem, addr & 0xfff, reg[e]); - } + + const unsigned end = e + (16 - (addr & 15)); + for (unsigned i = e; i < end; i++) + WRITE_MEM_U8(rsp->dmem, addr++, byteFromHalfWords(rsp->cp2.regs[rt].e, i & 15)); } - // Complements LQV? void RSP_LRV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LRV); - if (e != 0) - return; - unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 1) - return; - - unsigned b = (addr & 0xf) >> 1; + unsigned addr = rsp->sr[base] + offset * 16; + const unsigned start = 16 - ((addr & 0xf) - e); addr &= ~0xf; - auto *reg = rsp->cp2.regs[rt].e; - for (e = 8 - b; e < 8; e++, addr += 2) - reg[e] = READ_MEM_U16(rsp->dmem, addr & 0xfff); + for (unsigned i = start; i < 16; i++) + writeByteToHalfWords(rsp->cp2.regs[rt].e, i & 0xf, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); } void RSP_SRV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(SRV); - if (e != 0) - return; unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 1) - return; - - unsigned b = (addr & 0xf) >> 1; + const unsigned end = e + (addr & 0xf); + base = 16 - (addr & 0xf); addr &= ~0xf; - auto *reg = rsp->cp2.regs[rt].e; - for (e = 8 - b; e < 8; e++, addr += 2) - WRITE_MEM_U16(rsp->dmem, addr & 0xfff, reg[e]); + for (unsigned i = e; i < end; i++) + WRITE_MEM_U8(rsp->dmem, addr++, byteFromHalfWords(rsp->cp2.regs[rt].e, i + base & 0xf)); } - // Transposed stuff? void RSP_LTV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(LTV); - if (e & 1) - return; - if (rt & 7) - return; - unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 0xf) - return; + unsigned addr = rsp->sr[base] + offset * 16; + const unsigned start = addr & ~7; + const unsigned vt0 = rt & ~7; + addr = start + ((e + (addr & 8)) & 0xf); + unsigned j = e >> 1; - for (unsigned i = 0; i < 8; i++) - rsp->cp2.regs[rt + i].e[(-e / 2 + i) & 7] = READ_MEM_U16(rsp->dmem, addr + 2 * i); + for (unsigned i = 0; i < 16; j++) + { + j &= 7; + writeByteToHalfWords(rsp->cp2.regs[vt0+j].e, i++, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); + if (addr == start + 16) addr = start; + writeByteToHalfWords(rsp->cp2.regs[vt0+j].e, i++, READ_MEM_U8(rsp->dmem, addr++ & 0xfff)); + if (addr == start + 16) addr = start; + } } void RSP_STV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base) { TRACE_LS(STV); - if (e & 1) - return; - if (rt & 7) - return; + e &= ~1; + rt &= ~7; + unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff; - if (addr & 0xf) - return; + unsigned element = 16 - e; + base = (addr & 7) - e; + addr &= ~7; - for (unsigned i = 0; i < 8; i++) + for (unsigned i = rt; i < rt + 8; i++ ) { - WRITE_MEM_U16(rsp->dmem, addr + 2 * i, rsp->cp2.regs[rt + ((e / 2 + i) & 7)].e[i]); + WRITE_MEM_U8(rsp->dmem, addr + (base++ & 0xf), byteFromHalfWords(rsp->cp2.regs[i].e, element++ & 0xf)); + WRITE_MEM_U8(rsp->dmem, addr + (base++ & 0xf), byteFromHalfWords(rsp->cp2.regs[i].e, element++ & 0xf)); } } } diff --git a/rsp/vfunctions.cpp b/rsp/vfunctions.cpp index c236d81..2f993b2 100644 --- a/rsp/vfunctions.cpp +++ b/rsp/vfunctions.cpp @@ -25,6 +25,13 @@ extern "C" { + static inline int32_t clamp16s(int32_t x) + { + if (x > 0x7fff) return 0x7fff; + if (x < -0x8000) return -0x8000; + return x; + } + // // VABS // @@ -297,6 +304,25 @@ extern "C" STORE_RESULT(); } + void RSP_VMACQ(RSP::CPUState *rsp, unsigned vd, unsigned, unsigned, unsigned) + { + TRACE_VU(VMACQ); + uint16_t *acc = rsp->cp2.acc.e; + for (unsigned i = 0; i < 8; i++) + { + int32_t prod = (int16_t)acc[i] << 16; + prod |= acc[8+i]; + if (prod < 0 && !(prod & 1 << 5)) + prod += 32; + else if (prod >= 32 && !(prod & 1 << 5)) + prod -= 32; + acc[i] = prod >> 16; + acc[8+i] = prod & 0xffffu; + + rsp->cp2.regs[vd].e[i] = clamp16s(prod >> 1) & ~15; + } + } + // // VMADH // VMUDH @@ -464,10 +490,9 @@ extern "C" { TRACE_VU(VMOV); uint16_t *acc = rsp->cp2.acc.e; - unsigned de = vs & 0x7; write_acc_lo(acc, LOAD_VT()); - __m128i result = rsp_vmov(rsp, vt, e, vd, de); - STORE_RESULT(); + vs &= 0x7; + rsp->cp2.regs[vd].e[vs] = rsp->cp2.acc.e[16+vs]; } // @@ -489,6 +514,7 @@ extern "C" // // VMULF + // VMULQ // VMULU // void RSP_VMULF(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) @@ -505,6 +531,28 @@ extern "C" STORE_RESULT(); } + void RSP_VMULQ(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) + { + TRACE_VU(VMULQ); + uint16_t *acc = rsp->cp2.acc.e; + uint16_t *vde = rsp->cp2.regs[vd].e; + int16_t *vse = (int16_t*)rsp->cp2.regs[vs].e; + + int16_t vte[8]; + rsp_vect_t vtt = LOAD_VT(); + rsp_vect_write_operand((uint16_t*)vte, vtt); + + for (unsigned i = 0; i < 8; i++) + { + int32_t prod = vse[i] * vte[i]; + if (prod < 0) prod += 31; + acc[i] = prod >> 16; + acc[8+i] = prod & 0xffff; + acc[16+i] = 0; + vde[i] = clamp16s(prod >> 1) & ~15; + } + } + void RSP_VMULU(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) { TRACE_VU(VMULU); @@ -519,6 +567,52 @@ extern "C" STORE_RESULT(); } + // + // VRNDP + // VRNDN + // + static inline void RSP_VRND(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e, uint_fast8_t variant) + { + int16_t vte[8]; + rsp_vect_t vtt = LOAD_VT(); + rsp_vect_write_operand((uint16_t*)vte, vtt); + uint16_t *acc = rsp->cp2.acc.e; + uint16_t *vde = rsp->cp2.regs[vd].e; + + for (unsigned i = 0; i < 8; i++) + { + int64_t acc48 = + ((int64_t)(int16_t)acc[i] << 32) | + ((int64_t)acc[8+i] << 16) | + (int64_t)acc[16+i]; + + const uint_fast8_t negative_acc = acc48 < 0; + if (!!variant xor !!negative_acc) + { + int64_t value = (int64_t)(int16_t)vte[i]; + if (vs & 1) value <<= 16; + acc48 += value; + } + + acc[i] = (acc48 >> 32) & 0xffff; + acc[8+i] = (acc48 >> 16) & 0xffff; + acc[16+i] = acc48 & 0xffff; + vde[i] = clamp16s((int32_t)(acc48 >> 16)); + } + } + + void RSP_VRNDN(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) + { + TRACE_VU(RSP_VRNDN); + RSP_VRND(rsp, vd, vs, vt, e, 0); + } + + void RSP_VRNDP(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) + { + TRACE_VU(RSP_VRNDP); + RSP_VRND(rsp, vd, vs, vt, e, 1); + } + // // VNOP // @@ -563,7 +657,6 @@ extern "C" TRACE_VU(VRCP); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -577,7 +670,6 @@ extern "C" TRACE_VU(VRCPL); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -593,7 +685,6 @@ extern "C" TRACE_VU(VRSQ); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -607,7 +698,6 @@ extern "C" TRACE_VU(VRSQL); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -627,7 +717,6 @@ extern "C" TRACE_VU(VRCPH); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -643,7 +732,6 @@ extern "C" TRACE_VU(VRSQH); uint16_t *acc = rsp->cp2.acc.e; unsigned de = vs & 0x7; - e &= 0x7; write_acc_lo(acc, LOAD_VT()); @@ -745,9 +833,13 @@ extern "C" } // RESERVED - void RSP_RESERVED(RSP::CPUState *rsp, unsigned vd, unsigned, unsigned, unsigned) + void RSP_RESERVED(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) { - rsp_vect_t result = rsp_vzero(); + uint16_t *acc = rsp->cp2.acc.e; + rsp_vect_t result = _mm_add_epi16(LOAD_VS(), LOAD_VT()); + write_acc_lo(acc, result); + + result = rsp_vzero(); STORE_RESULT(); } } diff --git a/rsp_jit.cpp b/rsp_jit.cpp index a0e1c36..2052404 100644 --- a/rsp_jit.cpp +++ b/rsp_jit.cpp @@ -874,12 +874,12 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, using VUOp = void (*)(RSP::CPUState *, unsigned vd, unsigned vs, unsigned vt, unsigned e); static const VUOp ops[64] = { - RSP_VMULF, RSP_VMULU, nullptr, nullptr, RSP_VMUDL, RSP_VMUDM, RSP_VMUDN, RSP_VMUDH, RSP_VMACF, RSP_VMACU, nullptr, - nullptr, RSP_VMADL, RSP_VMADM, RSP_VMADN, RSP_VMADH, RSP_VADD, RSP_VSUB, nullptr, RSP_VABS, RSP_VADDC, RSP_VSUBC, + RSP_VMULF, RSP_VMULU, RSP_VRNDP, RSP_VMULQ, RSP_VMUDL, RSP_VMUDM, RSP_VMUDN, RSP_VMUDH, RSP_VMACF, RSP_VMACU, RSP_VRNDN, + RSP_VMACQ, RSP_VMADL, RSP_VMADM, RSP_VMADN, RSP_VMADH, RSP_VADD, RSP_VSUB, nullptr, RSP_VABS, RSP_VADDC, RSP_VSUBC, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, RSP_VSAR, nullptr, nullptr, RSP_VLT, RSP_VEQ, RSP_VNE, RSP_VGE, RSP_VCL, RSP_VCH, RSP_VCR, RSP_VMRG, RSP_VAND, RSP_VNAND, RSP_VOR, RSP_VNOR, RSP_VXOR, RSP_VNXOR, nullptr, nullptr, RSP_VRCP, RSP_VRCPL, RSP_VRCPH, RSP_VMOV, RSP_VRSQ, RSP_VRSQL, RSP_VRSQH, - RSP_VNOP, + RSP_VNOP, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, RSP_VNOP }; auto *vuop = ops[op]; @@ -985,6 +985,7 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, case 007: // SRAV { + NOP_IF_RD_ZERO(); unsigned rt_reg = regs.load_mips_register_sext(_jit, rt); unsigned rs_reg = regs.load_mips_register_noext(_jit, rs); unsigned rs_tmp_reg = regs.modify_mips_register(_jit, RegisterCache::SCRATCH_REGISTER0); @@ -1385,7 +1386,8 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, case 013: // SLTIU { - TWO_REG_IMM_OP(lti_u, uint16_t, zext); + // SLTIU sign extends the immediate to 32 bit but then does an unsigned comparison + TWO_REG_IMM_OP(lti_u, int16_t, sext); break; } @@ -1422,7 +1424,7 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, unsigned rt = (instr >> 16) & 31; NOP_IF_RT_ZERO(); int16_t imm = int16_t(instr); - regs.immediate_mips_register(_jit, rt, uint16_t(imm) << 16); + regs.immediate_mips_register(_jit, rt, imm << 16); regs.unlock_mips_register(rt); break; } @@ -1564,6 +1566,7 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, } case 043: // LW + case 047: // LWU { jit_emit_load_operation(_jit, pc, instr, [](jit_state_t *_jit, unsigned a, unsigned b, unsigned c) { jit_ldxr_i(a, b, c); }, @@ -1636,7 +1639,7 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, using LWC2Op = void (*)(RSP::CPUState *, unsigned rt, unsigned imm, int simm, unsigned rs); static const LWC2Op ops[32] = { - RSP_LBV, RSP_LSV, RSP_LLV, RSP_LDV, RSP_LQV, RSP_LRV, RSP_LPV, RSP_LUV, RSP_LHV, nullptr, nullptr, RSP_LTV, + RSP_LBV, RSP_LSV, RSP_LLV, RSP_LDV, RSP_LQV, RSP_LRV, RSP_LPV, RSP_LUV, RSP_LHV, RSP_LFV, nullptr, RSP_LTV, }; auto *op = ops[rd]; @@ -1668,7 +1671,7 @@ void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, using SWC2Op = void (*)(RSP::CPUState *, unsigned rt, unsigned imm, int simm, unsigned rs); static const SWC2Op ops[32] = { - RSP_SBV, RSP_SSV, RSP_SLV, RSP_SDV, RSP_SQV, RSP_SRV, RSP_SPV, RSP_SUV, RSP_SHV, RSP_SFV, nullptr, RSP_STV, + RSP_SBV, RSP_SSV, RSP_SLV, RSP_SDV, RSP_SQV, RSP_SRV, RSP_SPV, RSP_SUV, RSP_SHV, RSP_SFV, RSP_SWV, RSP_STV, }; auto *op = ops[rd]; diff --git a/rsp_op.hpp b/rsp_op.hpp index 8c8bd28..29523b8 100644 --- a/rsp_op.hpp +++ b/rsp_op.hpp @@ -30,6 +30,7 @@ extern "C" DECL_LS(LUV); DECL_LS(LHV); DECL_LS(LFV); + DECL_LS(LWV); DECL_LS(LTV); DECL_LS(SBV); @@ -42,17 +43,21 @@ extern "C" DECL_LS(SUV); DECL_LS(SHV); DECL_LS(SFV); + DECL_LS(SWV); DECL_LS(STV); #define DECL_COP2(op) void RSP_##op(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e) DECL_COP2(VMULF); DECL_COP2(VMULU); + DECL_COP2(VRNDP); + DECL_COP2(VMULQ); DECL_COP2(VMUDL); DECL_COP2(VMUDM); DECL_COP2(VMUDN); DECL_COP2(VMUDH); DECL_COP2(VMACF); DECL_COP2(VMACU); + DECL_COP2(VRNDN); DECL_COP2(VMACQ); DECL_COP2(VMADL); DECL_COP2(VMADM); diff --git a/state.hpp b/state.hpp index ce80dae..3822c73 100644 --- a/state.hpp +++ b/state.hpp @@ -37,7 +37,7 @@ enum CP0Registers CP0_REGISTER_SP_STATUS = 4, CP0_REGISTER_DMA_FULL = 5, CP0_REGISTER_DMA_BUSY = 6, - CP0_REGISTER_SP_RESERVED = 7, + CP0_REGISTER_SP_SEMAPHORE = 7, CP0_REGISTER_CMD_START = 8, CP0_REGISTER_CMD_END = 9, CP0_REGISTER_CMD_CURRENT = 10,