diff --git a/models/cpu/iss/flexfloat/flexfloat.c b/models/cpu/iss/flexfloat/flexfloat.c index 1d7900e6..b93ec407 100644 --- a/models/cpu/iss/flexfloat/flexfloat.c +++ b/models/cpu/iss/flexfloat/flexfloat.c @@ -537,6 +537,22 @@ INLINE void ff_add(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b #endif } +#include + +INLINE void ff_exp(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b) { + assert((dest->desc.exp_bits == a->desc.exp_bits) && (dest->desc.frac_bits == a->desc.frac_bits) && + (a->desc.exp_bits == b->desc.exp_bits) && (a->desc.frac_bits == b->desc.frac_bits)); + dest->value = (fp_t)exp((double)a->value); + #ifdef FLEXFLOAT_TRACKING + dest->exact_value = (fp_t)exp((double)a->exact_value); + if(dest->tracking_fn) (dest->tracking_fn)(dest, dest->tracking_arg); + #endif + flexfloat_sanitize(dest); + #ifdef FLEXFLOAT_STATS + if(StatsEnabled) getOpStats(dest->desc)->add += 1; + #endif +} + INLINE void ff_sub(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b) { assert((dest->desc.exp_bits == a->desc.exp_bits) && (dest->desc.frac_bits == a->desc.frac_bits) && (a->desc.exp_bits == b->desc.exp_bits) && (a->desc.frac_bits == b->desc.frac_bits)); diff --git a/models/cpu/iss/flexfloat/flexfloat.h b/models/cpu/iss/flexfloat/flexfloat.h index e7791f10..4f4ca654 100644 --- a/models/cpu/iss/flexfloat/flexfloat.h +++ b/models/cpu/iss/flexfloat/flexfloat.h @@ -231,6 +231,7 @@ long double ff_get_longdouble(const flexfloat_t *obj); void ff_inverse(flexfloat_t *dest, const flexfloat_t *a); void ff_add(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b); +void ff_exp(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b); void ff_sub(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b); void ff_mul(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b); void ff_div(flexfloat_t *dest, const flexfloat_t *a, const flexfloat_t *b); diff --git a/models/cpu/iss/include/cores/snitch/class.hpp 
b/models/cpu/iss/include/cores/snitch/class.hpp index c6aadf8d..e40f3565 100644 --- a/models/cpu/iss/include/cores/snitch/class.hpp +++ b/models/cpu/iss/include/cores/snitch/class.hpp @@ -98,6 +98,12 @@ class Iss bool snitch; bool fp_ss; + // -----------USE IO PORT TO HANDLE REDMULE------------------ + vp::IoMaster redmule_itf; + vp::IoReq* redmule_req; + uint16_t redmule_mnk_reg [4]; + uint32_t redmule_xwy_reg [4]; + // -----------USE MASTER AND SLAVE PORT TO HANDLE OFFLOAD REQUEST------------------ @@ -199,6 +205,7 @@ static inline iss_reg_t fmode_get(Iss *iss, iss_insn_t *insn) #include "cpu/iss/include/isa/rv32Xfaux.hpp" #include "cpu/iss/include/isa/priv.hpp" #include +#include #include "cpu/iss/include/isa/rv32frep.hpp" #include "cpu/iss/include/isa/rv32ssr.hpp" diff --git a/models/cpu/iss/include/cores/snitch_fp_ss/class.hpp b/models/cpu/iss/include/cores/snitch_fp_ss/class.hpp index 411689f9..6d80ed40 100644 --- a/models/cpu/iss/include/cores/snitch_fp_ss/class.hpp +++ b/models/cpu/iss/include/cores/snitch_fp_ss/class.hpp @@ -97,6 +97,11 @@ class Iss bool snitch; bool fp_ss; + // -----------USE IO PORT TO HANDLE REDMULE------------------ + vp::IoMaster redmule_itf; + vp::IoReq* redmule_req; + uint16_t redmule_mnk_reg [4]; + uint32_t redmule_xwy_reg [4]; // -----------USE MASTER AND SLAVE PORT TO HANDLE OFFLOAD REQUEST------------------ @@ -207,6 +212,7 @@ static inline iss_reg_t fmode_get(Iss *iss, iss_insn_t *insn) #include "cpu/iss/include/isa/rv32Xfaux.hpp" #include "cpu/iss/include/isa/priv.hpp" #include +#include #include "cpu/iss/include/isa/rv32frep.hpp" #include "cpu/iss/include/isa/rv32ssr.hpp" diff --git a/models/cpu/iss/include/isa/redmule.hpp b/models/cpu/iss/include/isa/redmule.hpp new file mode 100644 index 00000000..fde9d634 --- /dev/null +++ b/models/cpu/iss/include/isa/redmule.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2020 GreenWaves Technologies, SAS, ETH Zurich and + * University of Bologna + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cpu/iss/include/iss_core.hpp" +#include "cpu/iss/include/isa_lib/int.h" +#include "cpu/iss/include/isa_lib/macros.h" + +static inline iss_reg_t mcnfig_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) +{ + uint16_t m_size = REG_GET(0); + uint16_t n_size = REG_GET(1); + uint16_t k_size = (REG_GET(0) >> 16); + + iss->redmule_mnk_reg[0] = m_size; + iss->redmule_mnk_reg[1] = n_size; + iss->redmule_mnk_reg[2] = k_size; + + iss->redmule_req->init(); + iss->redmule_req->set_addr(0); + iss->redmule_req->set_data((uint8_t*)iss->redmule_mnk_reg); + iss->redmule_req->set_size(8); + iss->redmule_itf.req(iss->redmule_req); + + return iss_insn_next(iss, insn, pc); +} + + +static inline iss_reg_t marith_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) +{ + uint32_t x_addr = REG_GET(0); + uint32_t w_addr = REG_GET(1); + uint32_t y_addr = REG_GET(2); + uint32_t config = UIM_GET(0); + + iss->redmule_xwy_reg[0] = x_addr; + iss->redmule_xwy_reg[1] = w_addr; + iss->redmule_xwy_reg[2] = y_addr; + iss->redmule_xwy_reg[3] = config; + + iss->redmule_req->init(); + iss->redmule_req->set_addr(4); + iss->redmule_req->set_data((uint8_t*)iss->redmule_xwy_reg); + iss->redmule_req->set_size(16); + iss->redmule_itf.req(iss->redmule_req); + + return iss_insn_next(iss, insn, pc); +} diff --git a/models/cpu/iss/include/isa/rv32v.hpp b/models/cpu/iss/include/isa/rv32v.hpp index 188f2877..acde5fc9 100644 --- a/models/cpu/iss/include/isa/rv32v.hpp +++ 
b/models/cpu/iss/include/isa/rv32v.hpp @@ -20,8 +20,24 @@ //#include "spatz.hpp" #include "cpu/iss/include/isa_lib/vint.h" +static inline iss_reg_t vid_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_VIDV , REG_IN(0), 0, REG_OUT(0), 0); + return iss_insn_next(iss, insn, pc); +} + +static inline iss_reg_t vsll_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_SLLVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} + static inline iss_reg_t vadd_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_ADDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -37,6 +53,9 @@ static inline iss_reg_t vadd_vi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vsub_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_SUBVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -57,6 +76,9 @@ static inline iss_reg_t vrsub_vi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vand_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_ANDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -72,6 +94,9 @@ static inline iss_reg_t vand_vi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vor_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_ORVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -87,6 +112,9 @@ static inline iss_reg_t vor_vi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vxor_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_XORVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -102,6 +130,9 @@ static inline iss_reg_t vxor_vi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmin_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MINVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -112,6 +143,9 @@ static inline iss_reg_t vmin_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vminu_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MINUVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -122,6 +156,9 @@ static inline iss_reg_t vminu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmax_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MAXVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -132,6 +169,9 @@ static inline iss_reg_t vmax_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmaxu_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MAXUVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -142,6 +182,9 @@ static inline iss_reg_t vmaxu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmul_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MULVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vmul_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ @@ -151,6 +194,9 @@ static inline iss_reg_t vmul_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmulh_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MULHVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vmulh_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ @@ -160,6 +206,9 @@ static inline iss_reg_t vmulh_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmulhu_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MULHUVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vmulhu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ @@ -169,6 +218,9 @@ static inline iss_reg_t vmulhu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) static inline iss_reg_t vmulhsu_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MULHSUVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vmulhsu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ @@ -233,6 +285,9 @@ static inline iss_reg_t vwmulsu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc static inline iss_reg_t vmacc_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MACCVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -243,6 +298,9 @@ static inline iss_reg_t vmacc_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vmadd_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_MADDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -253,6 +311,9 @@ static inline iss_reg_t vmadd_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vnmsac_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_NMSACVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -263,6 +324,9 @@ static inline iss_reg_t vnmsac_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) static inline iss_reg_t vnmsub_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_NMSUBVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -308,41 +372,65 @@ static inline iss_reg_t vwmaccsu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t p static inline iss_reg_t vredsum_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDSUMVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredand_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDANDVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredor_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDORVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredxor_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDXORVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredmin_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDMINVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredminu_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDMINUVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredmax_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDMAXVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vredmaxu_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_REDMAXUVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } @@ -455,34 +543,58 @@ static inline iss_reg_t vremu_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ // } static inline iss_reg_t vle8_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VLE8V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, REG_OUT(0), -1, -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vle16_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VLE16V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, REG_OUT(0), -1, -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vle32_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VLE32V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, REG_OUT(0), -1, -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vle64_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VLE64V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, REG_OUT(0), -1, -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vse8_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VSE8V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, -1, REG_OUT(0), -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vse16_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VSE16V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, -1, REG_OUT(0), -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vse32_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VSE32V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, -1, REG_OUT(0), -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vse64_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL3(lib_VSE64V , REG_GET(0), REG_OUT(0), UIM_GET(0)); + uint64_t delay = iss->spatz.timing_insn(insn, 2, -1, REG_OUT(0), -1, iss->spatz.max_vlsu_latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + iss->spatz.max_vlsu_latency = 0; return iss_insn_next(iss, insn, pc); } @@ -689,143 +801,282 @@ static inline iss_reg_t vsse64_v_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ static inline iss_reg_t vfadd_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FADDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfadd_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FADDVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} + +static inline iss_reg_t vfexp_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FEXPVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} +static inline iss_reg_t vfexp_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FEXPVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfsub_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FSUBVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfsub_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FSUBVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfrsub_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FRSUBVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmin_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMINVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmin_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMINVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmax_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMAXVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmax_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMAXVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} + +static inline iss_reg_t vfdiv_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FDIVVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} +static inline iss_reg_t vfdiv_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FDIVVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmul_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMULVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmul_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMULVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmacc_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMACCVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmacc_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMACCVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmacc_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMACCVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmacc_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMACCVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmsac_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMSACVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmsac_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMSACVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmsac_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMSACVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmsac_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMSACVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmadd_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMADDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmadd_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMADDVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmadd_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMADDVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmadd_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMADDVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmsub_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMSUBVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfmsub_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FMSUBVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmsub_vv_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMSUBVV , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfnmsub_vf_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FNMSUBVF , REG_IN(1), FREG_GET(0), REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU); + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfredmax_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FREDMAXVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} + +static inline iss_reg_t vfredmax_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FREDMAXVX , REG_GET(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfredmin_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FREDMINVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfredsum_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FREDSUMVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? 
iss->exec.stall_cycles : delay; + return iss_insn_next(iss, insn, pc); +} + +static inline iss_reg_t vfredsum_vx_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ + LIB_CALL4(lib_FREDSUMVX , REG_GET(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), -1, REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } static inline iss_reg_t vfredosum_vs_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc){ LIB_CALL4(lib_FREDSUMVS , REG_IN(0), REG_IN(1) , REG_OUT(0), UIM_GET(0)); + uint32_t latency = ((VL + NUM_FPU - 1)/NUM_FPU) - 1 + clog2(NUM_FPU) - 1; + uint64_t delay = iss->spatz.timing_insn(insn, 0, REG_OUT(0), REG_IN(0), REG_IN(1), latency, iss->top.clock.get_cycles()); + iss->exec.stall_cycles = (iss->exec.stall_cycles > delay)? iss->exec.stall_cycles : delay; return iss_insn_next(iss, insn, pc); } diff --git a/models/cpu/iss/include/isa/xdma.hpp b/models/cpu/iss/include/isa/xdma.hpp index 84e01c06..cadf5f31 100644 --- a/models/cpu/iss/include/isa/xdma.hpp +++ b/models/cpu/iss/include/isa/xdma.hpp @@ -109,7 +109,7 @@ static inline iss_reg_t dmcpyi_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) return iss_insn_next(iss, insn, pc); } -static inline iss_reg_t dmstat_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) +static inline iss_reg_t dmmask_exec(Iss *iss, iss_insn_t *insn, iss_reg_t pc) { IssOffloadInsn offload_insn = { .opcode=insn->opcode, diff --git a/models/cpu/iss/include/isa_lib/vint.h b/models/cpu/iss/include/isa_lib/vint.h index 26cb748e..dcbd34ce 100644 --- a/models/cpu/iss/include/isa_lib/vint.h +++ b/models/cpu/iss/include/isa_lib/vint.h @@ -31,6 +31,12 @@ #include #include "assert.h" +#include + +inline uint32_t clog2(uint32_t x) { + return (x <= 1) ? 
0 : 32 - __builtin_clz(x - 1); +} + #pragma STDC FENV_ACCESS ON @@ -93,6 +99,7 @@ #define LMUL iss->spatz.LMUL_t #define VL iss->csr.vl.value #define VSTART iss->csr.vstart.value +#define NUM_FPU (CONFIG_GVSOC_ISS_SPATZ_FPU * ((64 + SEW - 1) / (SEW))) static inline void printBin(int size, bool *a,const char name[]){ @@ -116,7 +123,8 @@ static inline void printHex(int size, bool *a,const char name[]){ static inline int bin8ToChar(bool *bin,int s, int e){ int c = 0; for(int i = s; i < e;i++){ - c += bin[i]*pow(2,i-s); + int tmp = (bin[i])<<(i-s); + c += tmp; } return c; } @@ -270,7 +278,8 @@ static inline void buildDataInt(Iss *iss, int vs, int i, int64_t* data){ } *data = 0; for(int j = 0;j < iteration;j++){ - *data += temp[j]*pow(2,8*j); + int64_t tmp = (temp[j])<<(8*j); + *data += tmp; } } @@ -300,7 +309,8 @@ static inline void myAbs(Iss *iss, int size, int vs, int i, int64_t* data){ *data = 0; for(int j = 0;j < iteration;j++){ - *data += temp[j]*(int64_t)pow(2,8*j); + int64_t tmp = (temp[j])<<(8*j); + *data += tmp; } *data += cin; *data = cin?(-*data):*data; @@ -316,21 +326,23 @@ static inline void myAbsU(Iss *iss,int size, int vs, int i, uint64_t* data){ *data = 0; for(int j = 0;j < iteration;j++){ - *data += temp[j]*(uint64_t)pow(2,8*j); + uint64_t tmp = (temp[j])<<(8*j); + *data += tmp; } } static inline void writeToVReg(Iss *iss, int size, int vd, int i, bool *bin){ int iteration = size/8; for(int j = 0; j < iteration; j++){ - iss->spatz.vregfile.vregs[vd][i*iteration+j] = bin8ToChar(bin,8*j,8*(j+1)); + iss->spatz.vregfile.vregs[vd][i*iteration+j] = bin8ToChar(bin,8*j,8*(j+1)); } } static inline void binToInt(int size, bool* dataIn, int64_t *res){ *res = 0; for (int j = 0; j < size; j++) { - *res += dataIn[j]*(int64_t)pow(2,j); + int64_t tmp = (dataIn[j])<<(j); + *res += tmp; } } @@ -582,9 +594,44 @@ INLINE void ff_sgnj(flexfloat_t *dest, const flexfloat_t *a,const flexfloat_t *b } 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +static inline void lib_VIDV (Iss *iss, int vs2, int64_t rs1, int vd, bool vm){ + + for (int i = VSTART; i < VL; i++){ + + int32_t val = i; + bool resBin[64]; + intToBin(SEW, abs(val), resBin); + + writeToVReg(iss, SEW, vd, i, resBin); + + } +} +static inline void lib_SLLVV (Iss *iss, int vs1, int vs2 , int vd, bool vm){ + int64_t data1, data2, res; + bool bin[8]; + bool resBin[64]; + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + + myAbs(iss, SEW, vs1, i, &data1); + myAbs(iss, SEW, vs2, i, &data2); + res = data2 << data1; + + intToBin(SEW, abs(res), resBin); + if(res < 0){ + twosComplement(SEW, resBin); + } + + if(!mask(vm,bin)){ + writeToVReg(iss, SEW, vd, i, resBin); + } + } +} static inline void lib_ADDVV (Iss *iss, int vs1, int vs2 , int vd, bool vm){ @@ -3418,6 +3465,56 @@ static inline void lib_FADDVF (Iss *iss, int vs2, int64_t rs1, int vd, bool vm } } +static inline void lib_FEXPVV (Iss *iss, int vs1, int vs2, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + + myAbsU(iss, SEW, vs1, i, &data1); + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_exp, data1, data2, e, m, res); + restoreFFRoundingMode(old); + intToBinU(SEW, res, resBin); + writeToVReg(iss, SEW, vd, i, resBin); + } + } +} + +static inline void lib_FEXPVF (Iss *iss, int vs2, int64_t rs1, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + data1 = rs1; + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) 
iss->spatz.vregfile.vregs[0][i/8],bin); + } + + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_exp, data1, data2, e, m, res); + restoreFFRoundingMode(old); + intToBinU(SEW, res, resBin); + writeToVReg(iss, SEW, vd, i, resBin); + + } + } +} + static inline void lib_FSUBVV (Iss *iss, int vs1, int vs2, int vd, bool vm){ bool bin[8]; unsigned long int res, data1, data2; @@ -3591,6 +3688,55 @@ static inline void lib_FMAXVF (Iss *iss, int vs2, int64_t rs1, int vd, bool vm } } +static inline void lib_FDIVVV (Iss *iss, int vs1, int vs2, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + + myAbsU(iss, SEW, vs1, i, &data1); + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_div, data2, data1, e, m, res); + + restoreFFRoundingMode(old); + intToBinU(SEW, res, resBin); + writeToVReg(iss, SEW, vd, i, resBin); + } + } +} + +static inline void lib_FDIVVF (Iss *iss, int vs2, int64_t rs1, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + data1 = rs1; + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_div, data2, data1, e, m, res); + restoreFFRoundingMode(old); + intToBinU(SEW, res, resBin); + writeToVReg(iss, SEW, vd, i, resBin); + } + } +} + static inline void lib_FMULVV (Iss *iss, int vs1, int vs2, int vd, bool vm){ bool bin[8]; unsigned long int res, data1, data2; @@ -4170,6 +4316,30 @@ static inline void lib_FREDMAXVS(Iss 
*iss, int vs1, int vs2, int vd, bool vm writeToVReg(iss, SEW, vd, 0, resBin); } +static inline void lib_FREDMAXVX(Iss *iss, int64_t rs1, int vs2, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + myAbsU(iss, SEW, vd, rs1, &data1); + res = data1; + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_max, data2, res, e, m, res); + restoreFFRoundingMode(old); + } + } + intToBinU(SEW, res, resBin); + writeToVReg(iss, SEW, vd, rs1, resBin); +} + static inline void lib_FREDMINVS(Iss *iss, int vs1, int vs2, int vd, bool vm){ bool bin[8]; unsigned long int res, data1, data2; @@ -4221,6 +4391,32 @@ static inline void lib_FREDSUMVS(Iss *iss, int vs1, int vs2, int vd, bool vm writeToVReg(iss, SEW, vd, 0, resBin); } +static inline void lib_FREDSUMVX(Iss *iss, int64_t rs1, int vs2, int vd, bool vm){ + bool bin[8]; + unsigned long int res, data1, data2; + uint8_t e, m; + bool resBin[64]; + myAbsU(iss, SEW, vd, rs1, &data1); + res = data1; + for (int i = VSTART; i < VL; i++){ + if(!(i%8)){ + intToBin(8,(int64_t) iss->spatz.vregfile.vregs[0][i/8],bin); + } + myAbsU(iss, SEW, vs2, i, &data2); + EMCase(SEW, &m, &e); + + if(!mask(vm,bin)){ + int old = setFFRoundingMode(iss, iss->csr.fcsr.frm); + FLOAT_EXEC_2(ff_add, data2, res, e, m, res); + // printf("res = %f\n",ff_res.value); + restoreFFRoundingMode(old); + } + } + intToBinU(SEW, res, resBin); + + writeToVReg(iss, SEW, vd, rs1, resBin); +} + static inline void lib_FWADDVV (Iss *iss, int vs1, int vs2, int vd, bool vm){ bool bin[8]; unsigned long int res, data1, data2; @@ -5584,8 +5780,6 @@ inline void Vlsu::handle_pending_io_access(Iss *iss) uint32_t addr = this->io_pending_addr; uint32_t addr_aligned = addr & ~(4 - 1); int size = addr_aligned + 4 - addr; - 
// printf("size = %d\n" , size); - // printf("io_pending_size = %d\n" , this->io_pending_size); if (size > this->io_pending_size){ size = this->io_pending_size; } @@ -5600,9 +5794,10 @@ inline void Vlsu::handle_pending_io_access(Iss *iss) this->io_pending_size -= size; this->io_pending_addr += size; - int err = this->io_itf[0].req(req); + int err = this->io_itf[this->next_io].req(req); if (err == vp::IO_REQ_OK){ - // this->event->enqueue(this->io_req.get_latency() + 1); + // printf("[Spatz IO] port = %d, latency = %d\n", this->next_io, this->io_req.get_latency()); + iss->spatz.max_vlsu_latency = (iss->spatz.max_vlsu_latency > this->io_req.get_latency())? iss->spatz.max_vlsu_latency : this->io_req.get_latency(); } else if (err == vp::IO_REQ_INVALID){ this->waiting_io_response = false; @@ -5611,6 +5806,9 @@ inline void Vlsu::handle_pending_io_access(Iss *iss) else{ } + + //Update next io port + this->next_io = (this->next_io + 1) % CONFIG_GVSOC_ISS_SPATZ_VLSU; } else{ this->waiting_io_response = false; @@ -6943,6 +7141,7 @@ static inline iss_reg_t lib_VSETVL(Iss *iss, int idxRs1, int idxRd, int rs1, int }else{ AVL = VL; } + return VL; } } diff --git a/models/cpu/iss/include/spatz.hpp b/models/cpu/iss/include/spatz.hpp index 8b348cd0..3a95941f 100644 --- a/models/cpu/iss/include/spatz.hpp +++ b/models/cpu/iss/include/spatz.hpp @@ -29,9 +29,9 @@ typedef uint8_t iss_Vel_t; #define ISS_NB_VREGS 32 //#define NB_VEL VLEN/SEW //#define NB_VEL 256/8 -#define NB_VEL 2048/8//??????????????????????? +#define NB_VEL 16384/8//??????????????????????? 
//#define VLMAX NB_VEL*iss->spatz.LMUL -#define VLMAX (int)((2048*LMUL)/SEW) +#define VLMAX (int)((16384*LMUL)/SEW) #define XLEN = ISS_REG_WIDTH #define FLEN = ISS_REG_WIDTH @@ -86,7 +86,7 @@ class Vlsu { Vlsu(Iss &iss); void build(); - vp::IoMaster io_itf[4]; + vp::IoMaster io_itf[CONFIG_GVSOC_ISS_SPATZ_VLSU]; vp::IoReq io_req; vp::ClockEvent *event; int io_retval; @@ -95,6 +95,7 @@ class Vlsu { uint8_t *io_pending_data; bool io_pending_is_write; bool waiting_io_response; + uint64_t next_io; private: Iss &iss; @@ -133,6 +134,15 @@ class Spatz VRegfile vregfile; Vlsu vlsu; + vp::Trace trace; + uint64_t last_stamp; + uint64_t runtime; + + //Scoreboard + uint64_t reg_score_board [ISS_NB_VREGS]; + uint64_t unit_score_board [3]; //fpu, slide, vlsu + uint64_t max_vlsu_latency; + uint64_t timing_insn(iss_insn_t *insn, int uint_id, int vd, int vs1, int vs2, uint64_t latency, uint64_t timestamp); }; diff --git a/models/cpu/iss/isa_gen/isa_rvv.py b/models/cpu/iss/isa_gen/isa_rvv.py index 3c39da9e..d213aa35 100644 --- a/models/cpu/iss/isa_gen/isa_rvv.py +++ b/models/cpu/iss/isa_gen/isa_rvv.py @@ -81,6 +81,8 @@ class Rv32v(IsaSubset): def __init__(self): super().__init__(name='v', instrs=[ + Instr('vsll.vv' , Format_OPV , '100101 - ----- ----- 000 ----- 1010111'), + Instr('vadd.vv' , Format_OPV , '000000 - ----- ----- 000 ----- 1010111'),#inst[25] = VM , VM = 0 mask enable Instr('vadd.vi' , Format_OPIVI, '000000 - ----- ----- 011 ----- 1010111'), Instr('vadd.vx' , Format_OPV , '000000 - ----- ----- 100 ----- 1010111'), @@ -201,84 +203,92 @@ def __init__(self): - Instr('vfadd.vv' , Format_OPV , '000000 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfadd.vf' , Format_OPVF , '000000 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfadd.vv' , Format_OPV , '000000 - ----- ----- 001 ----- 1010111'), + Instr('vfadd.vf' , Format_OPVF , '000000 - ----- ----- 101 ----- 1010111'), + + Instr('vfexp.vv' , Format_OPV , '001100 - ----- ----- 001 ----- 1010111'), + 
Instr('vfexp.vf' , Format_OPVF , '001100 - ----- ----- 101 ----- 1010111'), + + Instr('vfsub.vv' , Format_OPV , '000010 - ----- ----- 001 ----- 1010111'), + Instr('vfsub.vf' , Format_OPVF , '000010 - ----- ----- 101 ----- 1010111'), - Instr('vfsub.vv' , Format_OPV , '000010 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfsub.vf' , Format_OPVF , '000010 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfrsub.vf' , Format_OPVF , '100111 - ----- ----- 101 ----- 1010111'), - Instr('vfrsub.vf' , Format_OPVF , '100111 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmin.vv' , Format_OPV , '000100 - ----- ----- 001 ----- 1010111'), + Instr('vfmin.vf' , Format_OPVF , '000100 - ----- ----- 101 ----- 1010111'), - Instr('vfmin.vv' , Format_OPV , '000100 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmin.vf' , Format_OPVF , '000100 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmax.vv' , Format_OPV , '000110 - ----- ----- 001 ----- 1010111'), + Instr('vfmax.vf' , Format_OPVF , '000110 - ----- ----- 101 ----- 1010111'), - Instr('vfmax.vv' , Format_OPV , '000110 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmax.vf' , Format_OPVF , '000110 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfdiv.vv' , Format_OPV , '100000 - ----- ----- 001 ----- 1010111'), + Instr('vfdiv.vf' , Format_OPVF , '100000 - ----- ----- 101 ----- 1010111'), - Instr('vfmul.vv' , Format_OPV , '100100 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmul.vf' , Format_OPVF , '100100 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmul.vv' , Format_OPV , '100100 - ----- ----- 001 ----- 1010111'), + Instr('vfmul.vf' , Format_OPVF , '100100 - ----- ----- 101 ----- 1010111'), - Instr('vfmacc.vv' , Format_OPV , '101100 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmacc.vf' , Format_OPVF , '101100 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmacc.vv' , Format_OPV , 
'101100 - ----- ----- 001 ----- 1010111'), + Instr('vfmacc.vf' , Format_OPVF , '101100 - ----- ----- 101 ----- 1010111'), - Instr('vfnmacc.vv' , Format_OPV , '101101 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfnmacc.vf' , Format_OPVF , '101101 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfnmacc.vv' , Format_OPV , '101101 - ----- ----- 001 ----- 1010111'), + Instr('vfnmacc.vf' , Format_OPVF , '101101 - ----- ----- 101 ----- 1010111'), - Instr('vfmsac.vv' , Format_OPV , '101110 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmsac.vf' , Format_OPVF , '101110 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmsac.vv' , Format_OPV , '101110 - ----- ----- 001 ----- 1010111'), + Instr('vfmsac.vf' , Format_OPVF , '101110 - ----- ----- 101 ----- 1010111'), - Instr('vfnmsac.vv' , Format_OPV , '101111 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfnmsac.vf' , Format_OPVF , '101111 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfnmsac.vv' , Format_OPV , '101111 - ----- ----- 001 ----- 1010111'), + Instr('vfnmsac.vf' , Format_OPVF , '101111 - ----- ----- 101 ----- 1010111'), - Instr('vfmadd.vv' , Format_OPV , '101000 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmadd.vf' , Format_OPVF , '101000 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmadd.vv' , Format_OPV , '101000 - ----- ----- 001 ----- 1010111'), + Instr('vfmadd.vf' , Format_OPVF , '101000 - ----- ----- 101 ----- 1010111'), - Instr('vfnmadd.vv' , Format_OPV , '101001 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfnmadd.vf' , Format_OPVF , '101001 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfnmadd.vv' , Format_OPV , '101001 - ----- ----- 001 ----- 1010111'), + Instr('vfnmadd.vf' , Format_OPVF , '101001 - ----- ----- 101 ----- 1010111'), - Instr('vfmsub.vv' , Format_OPV , '101010 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfmsub.vf' , Format_OPVF , 
'101010 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfmsub.vv' , Format_OPV , '101010 - ----- ----- 001 ----- 1010111'), + Instr('vfmsub.vf' , Format_OPVF , '101010 - ----- ----- 101 ----- 1010111'), - Instr('vfnmsub.vv' , Format_OPV , '101011 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfnmsub.vf' , Format_OPVF , '101011 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfnmsub.vv' , Format_OPV , '101011 - ----- ----- 001 ----- 1010111'), + Instr('vfnmsub.vf' , Format_OPVF , '101011 - ----- ----- 101 ----- 1010111'), - Instr('vfredmax.vs' , Format_OPV , '000111 - ----- ----- 001 ----- 1010111', tags=['fp_op']), + Instr('vfredmax.vs' , Format_OPV , '000111 - ----- ----- 001 ----- 1010111'), + Instr('vfredmax.vx' , Format_OPV , '000111 - ----- ----- 011 ----- 1010111'), - Instr('vfredmin.vs' , Format_OPV , '000101 - ----- ----- 001 ----- 1010111', tags=['fp_op']), + Instr('vfredmin.vs' , Format_OPV , '000101 - ----- ----- 001 ----- 1010111'), - Instr('vfredsum.vs' , Format_OPV , '000001 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfredosum.vs' , Format_OPV , '000011 - ----- ----- 001 ----- 1010111', tags=['fp_op']), + Instr('vfredsum.vs' , Format_OPV , '000001 - ----- ----- 001 ----- 1010111'), + Instr('vfredsum.vx' , Format_OPV , '000001 - ----- ----- 011 ----- 1010111'), + Instr('vfredosum.vs' , Format_OPV , '000011 - ----- ----- 001 ----- 1010111'), - Instr('vfwadd.vv' , Format_OPV , '110000 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwadd.vf' , Format_OPVF , '110000 - ----- ----- 101 ----- 1010111', tags=['fp_op']), - Instr('vfwadd.wv' , Format_OPV , '110100 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwadd.wf' , Format_OPVF , '110100 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwadd.vv' , Format_OPV , '110000 - ----- ----- 001 ----- 1010111'), + Instr('vfwadd.vf' , Format_OPVF , '110000 - ----- ----- 101 ----- 1010111'), + Instr('vfwadd.wv' , Format_OPV 
, '110100 - ----- ----- 001 ----- 1010111'), + Instr('vfwadd.wf' , Format_OPVF , '110100 - ----- ----- 101 ----- 1010111'), - Instr('vfwsub.vv' , Format_OPV , '110010 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwsub.vf' , Format_OPVF , '110010 - ----- ----- 101 ----- 1010111', tags=['fp_op']), - Instr('vfwsub.wv' , Format_OPV , '110110 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwsub.wf' , Format_OPVF , '110110 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwsub.vv' , Format_OPV , '110010 - ----- ----- 001 ----- 1010111'), + Instr('vfwsub.vf' , Format_OPVF , '110010 - ----- ----- 101 ----- 1010111'), + Instr('vfwsub.wv' , Format_OPV , '110110 - ----- ----- 001 ----- 1010111'), + Instr('vfwsub.wf' , Format_OPVF , '110110 - ----- ----- 101 ----- 1010111'), - Instr('vfwmul.vv' , Format_OPV , '111000 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwmul.vf' , Format_OPVF , '111000 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwmul.vv' , Format_OPV , '111000 - ----- ----- 001 ----- 1010111'), + Instr('vfwmul.vf' , Format_OPVF , '111000 - ----- ----- 101 ----- 1010111'), - Instr('vfwmacc.vv' , Format_OPV , '111100 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwmacc.vf' , Format_OPVF , '111100 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwmacc.vv' , Format_OPV , '111100 - ----- ----- 001 ----- 1010111'), + Instr('vfwmacc.vf' , Format_OPVF , '111100 - ----- ----- 101 ----- 1010111'), - Instr('vfwmsac.vv' , Format_OPV , '111110 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwmsac.vf' , Format_OPVF , '111110 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwmsac.vv' , Format_OPV , '111110 - ----- ----- 001 ----- 1010111'), + Instr('vfwmsac.vf' , Format_OPVF , '111110 - ----- ----- 101 ----- 1010111'), - Instr('vfwnmsac.vv' , Format_OPV , '111111 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfwnmsac.vf' , Format_OPVF , 
'111111 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfwnmsac.vv' , Format_OPV , '111111 - ----- ----- 001 ----- 1010111'), + Instr('vfwnmsac.vf' , Format_OPVF , '111111 - ----- ----- 101 ----- 1010111'), - Instr('vfsgnj.vv' , Format_OPV , '001000 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfsgnj.vf' , Format_OPVF , '001000 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfsgnj.vv' , Format_OPV , '001000 - ----- ----- 001 ----- 1010111'), + Instr('vfsgnj.vf' , Format_OPVF , '001000 - ----- ----- 101 ----- 1010111'), - Instr('vfsgnjn.vv' , Format_OPV , '001001 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfsgnjn.vf' , Format_OPVF , '001001 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfsgnjn.vv' , Format_OPV , '001001 - ----- ----- 001 ----- 1010111'), + Instr('vfsgnjn.vf' , Format_OPVF , '001001 - ----- ----- 101 ----- 1010111'), - Instr('vfsgnjx.vv' , Format_OPV , '001010 - ----- ----- 001 ----- 1010111', tags=['fp_op']), - Instr('vfsgnjx.vf' , Format_OPVF , '001010 - ----- ----- 101 ----- 1010111', tags=['fp_op']), + Instr('vfsgnjx.vv' , Format_OPV , '001010 - ----- ----- 001 ----- 1010111'), + Instr('vfsgnjx.vf' , Format_OPVF , '001010 - ----- ----- 101 ----- 1010111'), Instr('vfcvt.xu.f.v' , Format_OPV , '010010 - ----- 00000 001 ----- 1010111', tags=['fp_op', 'nseq']), @@ -374,6 +384,9 @@ def __init__(self): Instr('vs2r.v' , Format_OPV , '001 0 001 01000 ----- 000 ----- 0100111'),# vd, (rs1), vm Instr('vs4r.v' , Format_OPV , '011 0 001 01000 ----- 000 ----- 0100111'),# vd, (rs1), vm Instr('vs8r.v' , Format_OPV , '111 0 001 01000 ----- 000 ----- 0100111'),# vd, (rs1), vm + + # bowwang + Instr('vid.v' , Format_OPIVI , '010100 - ----- ----- 010 ----- 1010111'), # V 1.0 Instr('vsetvli' , Format_OPVLI, '- ----------- ----- 111 ----- 1010111'), # zimm = {3'b000,vma,vta,vsew[2:0],vlmul[2:0]} diff --git a/models/cpu/iss/src/snitch/decode.cpp b/models/cpu/iss/src/snitch/decode.cpp index 
c3e6d53a..c3d5a588 100644 --- a/models/cpu/iss/src/snitch/decode.cpp +++ b/models/cpu/iss/src/snitch/decode.cpp @@ -530,7 +530,7 @@ int Decode::decode_opcode(iss_insn_t *insn, iss_reg_t pc, iss_opcode_t opcode) static iss_reg_t iss_exec_insn_illegal(Iss *iss, iss_insn_t *insn, iss_reg_t pc) { - iss->decode.trace.msg("Executing illegal instruction\n"); + iss->decode.trace.fatal("Executing illegal instruction (pc: 0x%lx, opcode: 0x%lx)\n", pc, insn->opcode); iss->exception.raise(pc, ISS_EXCEPT_ILLEGAL); return pc; } diff --git a/models/cpu/iss/src/snitch/iss.cpp b/models/cpu/iss/src/snitch/iss.cpp index 8feaf0ae..3aaa3fb5 100644 --- a/models/cpu/iss/src/snitch/iss.cpp +++ b/models/cpu/iss/src/snitch/iss.cpp @@ -64,6 +64,9 @@ void IssWrapper::reset(bool active) this->iss.gdbserver.reset(active); this->iss.syscalls.reset(active); this->iss.ssr.reset(active); +#if defined(CONFIG_GVSOC_ISS_INC_SPATZ) + this->iss.spatz.reset(active); +#endif } IssWrapper::IssWrapper(vp::ComponentConf &config) @@ -86,6 +89,9 @@ IssWrapper::IssWrapper(vp::ComponentConf &config) this->iss.exception.build(); this->iss.prefetcher.build(); this->iss.ssr.build(); +#if defined(CONFIG_GVSOC_ISS_INC_SPATZ) + this->iss.spatz.build(); +#endif traces.new_trace("wrapper", &this->trace, vp::DEBUG); } diff --git a/models/cpu/iss/src/snitch/snitch.cpp b/models/cpu/iss/src/snitch/snitch.cpp index 4d6a7b01..c821a4fb 100644 --- a/models/cpu/iss/src/snitch/snitch.cpp +++ b/models/cpu/iss/src/snitch/snitch.cpp @@ -50,6 +50,9 @@ Iss::Iss(IssWrapper &top) this->fp_ss = false; this->top.traces.new_trace("offload", &this->trace_iss, vp::DEBUG); + // -----------USE IO PORT TO HANDLE REDMULE------------------ + this->top.new_master_port("redmule_itf", &this->redmule_itf); + this->redmule_req = this->redmule_itf.req_new(0, 0, 0, 0); // -----------USE MASTER AND SLAVE PORT TO HANDLE OFFLOAD REQUEST------------------ this->event = this->top.event_new((vp::Block *)this, handle_event); diff --git 
a/models/cpu/iss/src/spatz.cpp b/models/cpu/iss/src/spatz.cpp
index e4cb6c51..bfd15777 100644
--- a/models/cpu/iss/src/spatz.cpp
+++ b/models/cpu/iss/src/spatz.cpp
@@ -8,8 +8,17 @@
 
 
 Spatz::Spatz(Iss &iss)
-    : vregfile(iss), vlsu(iss)
+    : vregfile(iss), vlsu(iss), last_stamp(0), runtime(0), max_vlsu_latency(0)
 {
+    iss.top.traces.new_trace("spatz/trace", &this->trace, vp::DEBUG);
+    for (int i = 0; i < ISS_NB_VREGS; ++i)
+    {
+        this->reg_score_board[i] = 0;
+    }
+    for (int i = 0; i < 3; ++i)
+    {
+        this->unit_score_board[i] = 0;
+    }
 }
 
 void Spatz::build()
@@ -25,6 +34,57 @@ VRegfile::VRegfile(Iss &iss)
     : iss(iss){
     VRegfile::reset(true);
 }
 
+// Scoreboard timing model for one instruction.
+// The start time (meet_point) is the latest of timestamp, the time recorded for
+// unit uint_id and the times recorded for source registers vs1 and vs2. The end
+// time (future_point) is meet_point + latency; it is recorded for the unit and
+// for destination register vd. A negative uint_id, vd, vs1 or vs2 skips the
+// corresponding scoreboard entry.
+// Returns the delay between timestamp and meet_point.
+uint64_t Spatz::timing_insn(iss_insn_t *insn, int uint_id, int vd, int vs1, int vs2, uint64_t latency, uint64_t timestamp)
+{
+    uint64_t meet_point = timestamp;
+    uint64_t future_point = 0;
+    uint64_t delay = 0;
+    if (uint_id >= 0)
+    {
+        meet_point = (this->unit_score_board[uint_id] > meet_point)? this->unit_score_board[uint_id] : meet_point;
+    }
+
+    if (vs1 >= 0)
+    {
+        meet_point = (this->reg_score_board[vs1] > meet_point)? this->reg_score_board[vs1] : meet_point;
+    }
+
+    if (vs2 >= 0)
+    {
+        meet_point = (this->reg_score_board[vs2] > meet_point)? this->reg_score_board[vs2] : meet_point;
+    }
+
+    delay = meet_point - timestamp;
+    future_point = meet_point + latency;
+
+    // Only the part of [meet_point, future_point] lying after last_stamp is added to runtime.
+    uint64_t add_runtime = (this->last_stamp >= future_point)? 0: (this->last_stamp >= meet_point)? future_point - this->last_stamp : future_point - meet_point;
+    this->runtime += add_runtime;
+    this->last_stamp = future_point;
+    // The values are uint64_t, so print them with %llu rather than an int conversion.
+    this->trace.msg("[Spatz] Finished : %llu ns ---> %llu ns | period = %llu ns | runtime = %llu ns | Instruction = %s\n", static_cast<unsigned long long>(meet_point), static_cast<unsigned long long>(future_point), static_cast<unsigned long long>(future_point - meet_point), static_cast<unsigned long long>(this->runtime), insn->decoder_item->u.insn.label);
+
+    // uint_id may be negative (see the guarded read above); an unguarded write would
+    // index outside unit_score_board.
+    if (uint_id >= 0)
+    {
+        this->unit_score_board[uint_id] = future_point;
+    }
+    if (vd >= 0)
+    {
+        this->reg_score_board[vd] = future_point;
+    }
+
+    return delay;
+}
+
 
 void Vlsu::data_response(vp::Block *__this, vp::IoReq *req)
@@ -34,17 +94,17 @@ void Vlsu::data_response(vp::Block *__this, vp::IoReq *req)
 Vlsu::Vlsu(Iss &iss)
     : iss(iss)
 {
+    this->next_io = 0;
 }
 
 void Vlsu::build()
 {
-    for (int i=0; i<4; i++)
+    for (int i=0; iio_itf[i].set_resp_meth(&Vlsu::data_response);
         this->iss.top.new_master_port("vlsu_" + std::to_string(i), &this->io_itf[i], (vp::Block *)this);
     }
-
-
+    this->next_io = 0;
 }
 inline void VRegfile::reset(bool active){
     if (active){
@@ -56,9 +116,9 @@ inline void VRegfile::reset(bool active){
     }
 }
 
-Iss::Iss(IssWrapper &top)
-    : prefetcher(*this), exec(top, *this), insn_cache(*this), decode(*this), timing(*this), core(*this), irq(*this),
-      gdbserver(*this), lsu(*this), dbgunit(*this), syscalls(top, *this), trace(*this), csr(*this),
-      regfile(top, *this), mmu(*this), pmp(*this), exception(*this), spatz(*this), memcheck(top, *this), top(top)
-{
-}
+// Iss::Iss(IssWrapper &top)
+//     : prefetcher(*this), exec(top, *this), insn_cache(*this), decode(*this), timing(*this), core(*this), irq(*this),
+//       gdbserver(*this), lsu(*this), dbgunit(*this), syscalls(top, *this), trace(*this), csr(*this),
+//       regfile(top, *this), mmu(*this), pmp(*this), exception(*this), spatz(*this), memcheck(top, *this), top(top)
+// {
+// }