Architected flat scratch is added in MI300. It stores the scratch base address in dedicated registers rather than in SGPRs; these registers are used by scratch_ instructions, which are flat instructions that explicitly target the private memory aperture and use a different address calculation than global_ instructions. This change implements architected flat scratch support, fixes the address calculation of scratch_ instructions, and implements decodings for some scratch_ instructions. Existing flat_ instructions that happen to access the private memory aperture see no change in address calculation. Since scratch_ instructions are identical to flat_ instructions except for address calculation, the decodings simply reuse the existing flat_ instruction definitions.

Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
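In rough terms, the two address calculations differ as follows (an illustrative sketch based on the helpers in this change, not ISA pseudocode; swizzle() is the fixed-stride scratch swizzle defined in Inst_FLAT below):

    // global_: per-lane base plus VGPR offset and instruction offset
    vaddr[lane] = base + voffset[lane] + inst_offset;
    // scratch_: swizzled offset applied to the architected flat scratch base
    vaddr[lane] = flat_scratch_base
                + swizzle(offset + inst_offset, lane, elem_size);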
/*
 * Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__

#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/gpu_mem_helpers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/operand.hh"
#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
#include "mem/ruby/system/RubySystem.hh"

namespace gem5
{

namespace VegaISA
{
    struct BufferRsrcDescriptor
    {
        uint64_t baseAddr : 48;
        uint32_t stride : 14;
        uint32_t cacheSwizzle : 1;
        uint32_t swizzleEn : 1;
        uint32_t numRecords : 32;
        uint32_t dstSelX : 3;
        uint32_t dstSelY : 3;
        uint32_t dstSelZ : 3;
        uint32_t dstSelW : 3;
        uint32_t numFmt : 3;
        uint32_t dataFmt : 4;
        uint32_t elemSize : 2;
        uint32_t idxStride : 2;
        uint32_t addTidEn : 1;
        uint32_t atc : 1;
        uint32_t hashEn : 1;
        uint32_t heap : 1;
        uint32_t mType : 3;
        uint32_t type : 2;
    };
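
    // Note: the bit-fields above total 128 bits, i.e., the four-dword
    // buffer resource descriptor (V#) that buffer instructions read from
    // an aligned group of four SGPRs.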

    // --- purely virtual instruction classes ---

    class Inst_SOP2 : public VEGAGPUStaticInst
    {
      public:
        Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_SOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP2 *);
    }; // Inst_SOP2

    class Inst_SOPK : public VEGAGPUStaticInst
    {
      public:
        Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
        ~Inst_SOPK();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_SOPK instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOPK *);
    }; // Inst_SOPK

    class Inst_SOP1 : public VEGAGPUStaticInst
    {
      public:
        Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
        ~Inst_SOP1();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_SOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP1 *);
    }; // Inst_SOP1

    class Inst_SOPC : public VEGAGPUStaticInst
    {
      public:
        Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
        ~Inst_SOPC();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_SOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOPC *);
    }; // Inst_SOPC

    class Inst_SOPP : public VEGAGPUStaticInst
    {
      public:
        Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
        ~Inst_SOPP();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_SOPP instData;
    }; // Inst_SOPP

    class Inst_SMEM : public VEGAGPUStaticInst
    {
      public:
        Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
        ~Inst_SMEM();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        /**
         * initiate a memory read access for N dwords
         */
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                    MemCmd::ReadReq);
        }

        /**
         * initiate a memory write access for N dwords
         */
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                    MemCmd::WriteReq);
        }

        /**
         * For normal s_load_dword/s_store_dword instruction addresses.
         */
        void
        calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
                 ScalarRegU32 offset)
        {
            Addr vaddr = ((addr.rawData() + offset) & ~0x3);
            gpu_dyn_inst->scalarAddr = vaddr;
        }
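
        // e.g., addr.rawData() = 0x1001 with offset = 0x6 sums to 0x1007,
        // which the ~0x3 mask rounds down to the dword-aligned vaddr 0x1004.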

        /**
         * For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
         * The s_buffer instructions use the same buffer resource descriptor
         * as the MUBUF instructions.
         */
        void
        calcAddr(GPUDynInstPtr gpu_dyn_inst,
                 ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
        {
            BufferRsrcDescriptor rsrc_desc;
            ScalarRegU32 clamped_offset(offset);
            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(BufferRsrcDescriptor));

            /**
             * The address is clamped if:
             *   Stride is zero: clamp if offset >= num_records
             *   Stride is non-zero: clamp if offset > (stride * num_records)
             */
            if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
                clamped_offset = rsrc_desc.numRecords;
            } else if (rsrc_desc.stride && offset
                       > (rsrc_desc.stride * rsrc_desc.numRecords)) {
                clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
            }

            Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
            gpu_dyn_inst->scalarAddr = vaddr;
        }
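
        // e.g., with stride = 0, numRecords = 0x100, and offset = 0x180,
        // clamped_offset becomes 0x100, so the scalar buffer access
        // saturates at the end of the buffer instead of reading past it.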

        // first instruction DWORD
        InFmt_SMEM instData;
        // second instruction DWORD
        InFmt_SMEM_1 extData;
    }; // Inst_SMEM

    class Inst_VOP2 : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
        ~Inst_VOP2();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

        template<typename T>
        T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
        {
            T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of the original src0, src1, and dest during
            // operand selection
            T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            T origSrc1(gpuDynInst, instData.VSRC1);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
                    "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
                    "SRC1_NEG: %d, SRC1_ABS: %d\n",
                    opcode().c_str(), extData.iFmt_VOP_SDWA.SRC0,
                    extData.iFmt_VOP_SDWA.DST_SEL, extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP, extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG, extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            return src0_sdwa;
        }

        template<typename T>
        void sdwaDstHelper(GPUDynInstPtr gpuDynInst, T & vdst)
        {
            T origVdst(gpuDynInst, instData.VDST);

            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        }

        template<typename T>
        T dppHelper(GPUDynInstPtr gpuDynInst, T & src1)
        {
            T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
                    "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
                    opcode().c_str(), extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL, extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG, extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG, extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK, extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            return src0_dpp;
        }

        template<typename ConstT, typename T>
        void vop2Helper(GPUDynInstPtr gpuDynInst,
                        void (*fOpImpl)(T&, T&, T&, Wavefront*))
        {
            Wavefront *wf = gpuDynInst->wavefront();
            T src0(gpuDynInst, instData.SRC0);
            T src1(gpuDynInst, instData.VSRC1);
            T vdst(gpuDynInst, instData.VDST);

            src0.readSrc();
            src1.read();

            if (isSDWAInst()) {
                T src0_sdwa = sdwaSrcHelper(gpuDynInst, src1);
                fOpImpl(src0_sdwa, src1, vdst, wf);
                sdwaDstHelper(gpuDynInst, vdst);
            } else if (isDPPInst()) {
                T src0_dpp = dppHelper(gpuDynInst, src1);
                fOpImpl(src0_dpp, src1, vdst, wf);
            } else {
                // src0 is unmodified. We need to use the const container
                // type to allow reading scalar operands from src0. Only
                // src0 can index scalar operands. We copy this to vdst
                // temporarily to pass to the lambda so the instruction
                // does not need to write two lambda functions (one for
                // a const src0 and one for a mutable src0).
                ConstT const_src0(gpuDynInst, instData.SRC0);
                const_src0.readSrc();

                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    vdst[lane] = const_src0[lane];
                }
                fOpImpl(vdst, src1, vdst, wf);
            }

            vdst.write();
        }
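
        // Sketch of a typical caller (illustrative; operand types vary by
        // instruction): the operation is passed as a non-capturing lambda,
        // e.g., for a 32-bit add:
        //
        //   vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst,
        //       [](VecOperandU32 &s0, VecOperandU32 &s1,
        //          VecOperandU32 &vdst, Wavefront *wf) {
        //           for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        //               if (wf->execMask(lane)) {
        //                   vdst[lane] = s0[lane] + s1[lane];
        //               }
        //           }
        //       });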

      private:
        bool hasSecondDword(InFmt_VOP2 *);
    }; // Inst_VOP2

    class Inst_VOP1 : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
        ~Inst_VOP1();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP1 *);
    }; // Inst_VOP1

    class Inst_VOPC : public VEGAGPUStaticInst
    {
      public:
        Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
        ~Inst_VOPC();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOPC *);
    }; // Inst_VOPC

    class Inst_VINTRP : public VEGAGPUStaticInst
    {
      public:
        Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
        ~Inst_VINTRP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_VINTRP instData;
    }; // Inst_VINTRP

    class Inst_VOP3A : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP3A(InFmt_VOP3A*, const std::string &opcode, bool sgpr_dst);
        ~Inst_VOP3A();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP3A instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3A *);
        /**
         * the v_cmp and readlane instructions in the VOP3
         * encoding are unique because they are the only
         * instructions that use the VDST field to specify
         * a scalar register destination. for VOP3::V_CMP insts
         * VDST specifies the arbitrary SGPR pair used to write
         * VCC. for V_READLANE VDST specifies the SGPR to return
         * the value of the selected lane in the source VGPR
         * from which we are reading.
         */
        const bool sgprDst;
    }; // Inst_VOP3A

    class Inst_VOP3B : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP3B(InFmt_VOP3B*, const std::string &opcode);
        ~Inst_VOP3B();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP3B instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3B *);
    }; // Inst_VOP3B

    class Inst_VOP3P : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
        ~Inst_VOP3P();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP3P instData;
        // second instruction DWORD
        InFmt_VOP3P_1 extData;

        template<typename T>
        void vop3pHelper(GPUDynInstPtr gpuDynInst,
                         T (*fOpImpl)(T, T, bool))
        {
            Wavefront *wf = gpuDynInst->wavefront();
            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
            VecOperandU32 D(gpuDynInst, instData.VDST);

            S0.readSrc();
            S1.readSrc();

            int opLo = instData.OPSEL;
            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
            int negLo = extData.NEG;
            int negHi = instData.NEG_HI;
            bool clamp = instData.CLMP;
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                                          word<T>(S1[lane], opHi, negHi, 1),
                                          clamp);
                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                                          word<T>(S1[lane], opLo, negLo, 1),
                                          clamp);

                    uint16_t upper_raw =
                        *reinterpret_cast<uint16_t*>(&upper_val);
                    uint16_t lower_raw =
                        *reinterpret_cast<uint16_t*>(&lower_val);

                    D[lane] = upper_raw << 16 | lower_raw;
                }
            }

            D.write();
        }

        template<typename T>
        void vop3pHelper(GPUDynInstPtr gpuDynInst,
                         T (*fOpImpl)(T, T, T, bool))
        {
            Wavefront *wf = gpuDynInst->wavefront();
            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
            ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
            VecOperandU32 D(gpuDynInst, instData.VDST);

            S0.readSrc();
            S1.readSrc();
            S2.readSrc();

            int opLo = instData.OPSEL;
            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
            int negLo = extData.NEG;
            int negHi = instData.NEG_HI;
            bool clamp = instData.CLMP;
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                                          word<T>(S1[lane], opHi, negHi, 1),
                                          word<T>(S2[lane], opHi, negHi, 2),
                                          clamp);
                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                                          word<T>(S1[lane], opLo, negLo, 1),
                                          word<T>(S2[lane], opLo, negLo, 2),
                                          clamp);

                    uint16_t upper_raw =
                        *reinterpret_cast<uint16_t*>(&upper_val);
                    uint16_t lower_raw =
                        *reinterpret_cast<uint16_t*>(&lower_val);

                    D[lane] = upper_raw << 16 | lower_raw;
                }
            }

            D.write();
        }

        void
        dotHelper(GPUDynInstPtr gpuDynInst,
                  uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
        {
            Wavefront *wf = gpuDynInst->wavefront();
            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
            ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
            VecOperandU32 D(gpuDynInst, instData.VDST);

            S0.readSrc();
            S1.readSrc();
            S2.readSrc();

            // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
            //   dword1[15:0]  is upper/lower 16b of src0 based on opsel[0]
            //   dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
            //   dword2[15:0]  is upper/lower 16b of src1 based on opsel[1]
            //   dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
            int opLo = instData.OPSEL;
            int opHi = extData.OPSEL_HI;
            int negLo = extData.NEG;
            int negHi = instData.NEG_HI;
            bool clamp = instData.CLMP;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    uint32_t dword1l =
                        word<uint16_t>(S0[lane], opLo, negLo, 0);
                    uint32_t dword1h =
                        word<uint16_t>(S0[lane], opHi, negHi, 0);
                    uint32_t dword2l =
                        word<uint16_t>(S1[lane], opLo, negLo, 1);
                    uint32_t dword2h =
                        word<uint16_t>(S1[lane], opHi, negHi, 1);

                    uint32_t dword1 = (dword1h << 16) | dword1l;
                    uint32_t dword2 = (dword2h << 16) | dword2l;

                    // fOpImpl takes the two crafted dwords and the src2
                    // dword. The function will need to call bits() to break
                    // them up to the correct size and then reinterpret cast
                    // to the correct value.
                    D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
                }
            }

            D.write();
        }

      private:
        bool hasSecondDword(InFmt_VOP3P *);

        template<typename T>
        T
        word(uint32_t data, int opSel, int neg, int opSelBit)
        {
            // This method assumes two words packed into a dword
            static_assert(sizeof(T) == 2);

            bool select = bits(opSel, opSelBit, opSelBit);
            uint16_t raw = select ? bits(data, 31, 16)
                                  : bits(data, 15, 0);

            // Apply input modifiers. This may seem odd, but the hardware
            // just flips the MSb instead of doing unary negation.
            bool negate = bits(neg, opSelBit, opSelBit);
            if (negate) {
                raw ^= 0x8000;
            }

            return *reinterpret_cast<T*>(&raw);
        }
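
        // e.g., data = 0xc0003c00 holds fp16 1.0 (0x3c00) in the low word
        // and fp16 -2.0 (0xc000) in the high word. A set opSel bit selects
        // bits 31:16 (0xc000), and a set neg bit XORs in 0x8000, yielding
        // 0x4000 (fp16 2.0): flipping the MSb is exactly fp16 sign negation.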
    }; // Inst_VOP3P

    class Inst_VOP3P_MAI : public VEGAGPUStaticInst
    {
      public:
        Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
        ~Inst_VOP3P_MAI();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_VOP3P_MAI instData;
        // second instruction DWORD
        InFmt_VOP3P_MAI_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3P_MAI *);
    }; // Inst_VOP3P_MAI

    class Inst_DS : public VEGAGPUStaticInst
    {
      public:
        Inst_DS(InFmt_DS*, const std::string &opcode);
        ~Inst_DS();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }

        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    for (int i = 0; i < N; ++i) {
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]
                            = wf->ldsChunk->read<VecElemU32>(
                                vaddr + i*sizeof(VecElemU32));
                    }
                }
            }
        }

        template<typename T>
        void
        initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                        = wf->ldsChunk->read<T>(vaddr0);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                        = wf->ldsChunk->read<T>(vaddr1);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }

        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    for (int i = 0; i < N; ++i) {
                        wf->ldsChunk->write<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32),
                            (reinterpret_cast<VecElemU32*>(
                                gpuDynInst->d_data))[lane * N + i]);
                    }
                }
            }
        }

        template<typename T>
        void
        initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2]);
                    wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
                }
            }
        }

        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;

                    AtomicOpFunctorPtr amo_op =
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    gpuDynInst->addr.at(lane) = (Addr)addr[lane];
                }
            }
        }

        // first instruction DWORD
        InFmt_DS instData;
        // second instruction DWORD
        InFmt_DS_1 extData;
    }; // Inst_DS

    class Inst_MUBUF : public VEGAGPUStaticInst
    {
      public:
        Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
        ~Inst_MUBUF();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            // temporarily modify exec_mask to suppress memory accesses to
            // oob regions. Only issue memory requests for lanes that have
            // their exec_mask set and are not out of bounds.
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
            gpuDynInst->exec_mask = old_exec_mask;
        }


        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            // temporarily modify exec_mask to suppress memory accesses to
            // oob regions. Only issue memory requests for lanes that have
            // their exec_mask set and are not out of bounds.
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
            gpuDynInst->exec_mask = old_exec_mask;
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            // temporarily modify exec_mask to suppress memory accesses to
            // oob regions. Only issue memory requests for lanes that have
            // their exec_mask set and are not out of bounds.
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
            gpuDynInst->exec_mask = old_exec_mask;
        }

        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            // temporarily modify exec_mask to suppress memory accesses to
            // oob regions. Only issue memory requests for lanes that have
            // their exec_mask set and are not out of bounds.
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
            gpuDynInst->exec_mask = old_exec_mask;
        }

        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst)
        {
            // temporarily modify exec_mask to suppress memory accesses to
            // oob regions. Only issue memory requests for lanes that have
            // their exec_mask set and are not out of bounds.
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
            gpuDynInst->exec_mask = old_exec_mask;
        }

        void
        injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
        {
            // create request and set flags
            gpuDynInst->resetEntireStatusVector();
            gpuDynInst->setStatusVector(0, 1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                                       gpuDynInst->computeUnit()->
                                                       requestorId(), 0,
                                                       gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
                injectGlobalMemFence(gpuDynInst, false, req);
        }

        /**
         * MUBUF instructions calculate their addresses as follows:
         *
         * index = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
         * offset = (OFFEN ? vgpr_off : 0) + inst_off
         *
         * / ====================== LINEAR ADDRESSING ====================== /
         * VADDR = base + sgpr_off + offset + stride * index
         *
         * / ===================== SWIZZLED ADDRESSING ===================== /
         * index_msb = index / const_index_stride
         * index_lsb = index % const_index_stride
         * offset_msb = offset / const_element_size
         * offset_lsb = offset % const_element_size
         * buffer_offset = ((index_msb * stride + offset_msb *
         *                 const_element_size) * const_index_stride +
         *                 index_lsb * const_element_size + offset_lsb)
         *
         * VADDR = base + sgpr_off + buffer_offset
         */
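        // e.g., swizzled addressing with const_index_stride = 16,
        // const_element_size = 4, stride = 8, index = 18, offset = 6:
        // index_msb = 1, index_lsb = 2, offset_msb = 1, offset_lsb = 2, so
        // buffer_offset = (1*8 + 1*4) * 16 + 2*4 + 2 = 202.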
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
        void
        calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
                 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
        {
            Addr vaddr = 0;
            Addr base_addr = 0;
            Addr stride = 0;
            Addr buf_idx = 0;
            Addr buf_off = 0;
            Addr buffer_offset = 0;
            BufferRsrcDescriptor rsrc_desc;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(BufferRsrcDescriptor));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();
                    /**
                     * first we calculate the buffer's index and offset.
                     * these will be used for either linear or swizzled
                     * buffers.
                     */
                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;

                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;
                        DPRINTF(VEGA, "mubuf swizzled lane %d: "
                                "idx_stride = %llx, elem_size = %llx, "
                                "idx_msb = %llx, idx_lsb = %llx, "
                                "off_msb = %llx, off_lsb = %llx\n",
                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
                                off_msb, off_lsb);

                        buffer_offset = (idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb;
                    } else {
                        buffer_offset = buf_off + stride * buf_idx;
                    }


                    /**
                     * Range check behavior causes out of range accesses
                     * to be treated differently. Out of range accesses
                     * return 0 for loads and are ignored for stores. For
                     * non-formatted accesses, this is done on a per-lane
                     * basis.
                     */
                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                        if (buffer_offset >=
                            rsrc_desc.numRecords - s_offset.rawData()) {
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
                                    "lane = %d, buffer_offset = %llx, "
                                    "const_stride = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off + stride * buf_idx,
                                    stride, rsrc_desc.numRecords);
                            oobMask.set(lane);
                            continue;
                        }
                    }

                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                        if (buf_idx >= rsrc_desc.numRecords ||
                            buf_off >= stride) {
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
                                    "lane = %d, offset = %llx, "
                                    "index = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off, buf_idx,
                                    rsrc_desc.numRecords);
                            oobMask.set(lane);
                            continue;
                        }
                    }

                    vaddr += buffer_offset;

                    DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
                            "vaddr = %llx, base_addr = %llx, "
                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                            lane, vaddr, base_addr, stride,
                            buf_idx, buf_off);
                    gpuDynInst->addr.at(lane) = vaddr;
                }
            }
        }

        // first instruction DWORD
        InFmt_MUBUF instData;
        // second instruction DWORD
        InFmt_MUBUF_1 extData;
        // Mask of lanes with out-of-bounds accesses. Needs to be tracked
        // separately from the exec_mask so that we remember to write zero
        // to the registers associated with out of bounds lanes.
        VectorMask oobMask;
    }; // Inst_MUBUF

    class Inst_MTBUF : public VEGAGPUStaticInst
    {
      public:
        Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
        ~Inst_MTBUF();

        int instSize() const override;
        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_MTBUF instData;
        // second instruction DWORD
        InFmt_MTBUF_1 extData;

      private:
        bool hasSecondDword(InFmt_MTBUF *);
    }; // Inst_MTBUF

    class Inst_MIMG : public VEGAGPUStaticInst
    {
      public:
        Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
        ~Inst_MIMG();

        int instSize() const override;
        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_MIMG instData;
        // second instruction DWORD
        InFmt_MIMG_1 extData;
    }; // Inst_MIMG

    class Inst_EXP : public VEGAGPUStaticInst
    {
      public:
        Inst_EXP(InFmt_EXP*, const std::string &opcode);
        ~Inst_EXP();

        int instSize() const override;
        void initOperandInfo() override;

      protected:
        // first instruction DWORD
        InFmt_EXP instData;
        // second instruction DWORD
        InFmt_EXP_1 extData;
    }; // Inst_EXP

    class Inst_FLAT : public VEGAGPUStaticInst
    {
      public:
        Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
        ~Inst_FLAT();

        int instSize() const override;
        void generateDisassembly() override;

        void initOperandInfo() override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
                gpuDynInst->executedAs() == enums::SC_PRIVATE) {
                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        Addr vaddr = gpuDynInst->addr[lane];
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                            = wf->ldsChunk->read<T>(vaddr);
                    }
                }
            }
        }

        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
                gpuDynInst->executedAs() == enums::SC_PRIVATE) {
                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        Addr vaddr = gpuDynInst->addr[lane];
                        for (int i = 0; i < N; ++i) {
                            (reinterpret_cast<VecElemU32*>(
                                gpuDynInst->d_data))[lane * N + i]
                                = wf->ldsChunk->read<VecElemU32>(
                                    vaddr + i*sizeof(VecElemU32));
                        }
                    }
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
                gpuDynInst->executedAs() == enums::SC_PRIVATE) {
                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        Addr vaddr = gpuDynInst->addr[lane];
                        wf->ldsChunk->write<T>(vaddr,
                            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                    }
                }
            }
        }

        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
                gpuDynInst->executedAs() == enums::SC_PRIVATE) {
                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        Addr vaddr = gpuDynInst->addr[lane];
                        for (int i = 0; i < N; ++i) {
                            wf->ldsChunk->write<VecElemU32>(
                                vaddr + i*sizeof(VecElemU32),
                                (reinterpret_cast<VecElemU32*>(
                                    gpuDynInst->d_data))[lane * N + i]);
                        }
                    }
                }
            }
        }

        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst)
        {
            // Flat scratch requests may not be atomic according to ISA
            // manuals up to MI200. See MI200 manual Table 45.
            assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);

            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        Addr vaddr = gpuDynInst->addr[lane];
                        auto amo_op =
                            gpuDynInst->makeAtomicOpFunctor<T>(
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->a_data))[lane],
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->x_data))[lane]);

                        T tmp = wf->ldsChunk->read<T>(vaddr);
                        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
                        wf->ldsChunk->write<T>(vaddr, tmp);
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
                    }
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ScalarRegU32 vaddr,
                 ScalarRegU32 saddr, ScalarRegI32 offset)
        {
            // Offset is a 13-bit field with the following meanings:
            // In Flat instructions, offset is a 12-bit unsigned number.
            // In Global/Scratch instructions, offset is a 13-bit signed
            // number.
            if (isFlat()) {
                offset = offset & 0xfff;
            } else {
                offset = (ScalarRegI32)sext<13>(offset);
            }
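
            // e.g., an encoded offset of 0x1fff is masked to 0xfff (+4095)
            // for flat instructions, but sign-extends to -1 for
            // global/scratch instructions.
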
            // If saddr = 0x7f there is no scalar reg to read and the
            // address will be a 64-bit address. Otherwise, saddr is the reg
            // index for a scalar reg used as the base address for a 32-bit
            // address.
            if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
                ConstVecOperandU64 vbase(gpuDynInst, vaddr);
                vbase.read();

                calcAddrVgpr(gpuDynInst, vbase, offset);
            } else if (isFlatGlobal()) {
                // Assume we are operating in 64-bit mode and read a pair of
                // SGPRs for the address base.
                ConstScalarOperandU64 sbase(gpuDynInst, saddr);
                sbase.read();

                ConstVecOperandU32 voffset(gpuDynInst, vaddr);
                voffset.read();

                calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
            // For scratch, saddr = 0x7f means there is no scalar reg to
            // read and a vgpr will be used for the address offset.
            // Otherwise, saddr is the sgpr index holding the address
            // offset. For scratch instructions the offset GPR is always
            // 32-bits.
            } else if (saddr != 0x7f) {
                assert(isFlatScratch());

                ConstScalarOperandU32 soffset(gpuDynInst, saddr);
                soffset.read();

                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);

                int elemSize;
                auto staticInst = gpuDynInst->staticInstruction();
                if (gpuDynInst->isLoad()) {
                    elemSize = staticInst->getOperandSize(2);
                } else {
                    assert(gpuDynInst->isStore());
                    elemSize = staticInst->getOperandSize(1);
                }

                unsigned swizzleOffset = soffset.rawData() + offset;
                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        gpuDynInst->addr.at(lane) = flat_scratch_addr
                            + swizzle(swizzleOffset, lane, elemSize);
                    }
                }
            } else {
                assert(isFlatScratch());

                ConstVecOperandU32 voffset(gpuDynInst, vaddr);
                voffset.read();

                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);

                int elemSize;
                auto staticInst = gpuDynInst->staticInstruction();
                if (gpuDynInst->isLoad()) {
                    elemSize = staticInst->getOperandSize(2);
                } else {
                    assert(gpuDynInst->isStore());
                    elemSize = staticInst->getOperandSize(1);
                }

                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        gpuDynInst->addr.at(lane) = flat_scratch_addr
                            + swizzle(voffset[lane] + offset, lane, elemSize);
                    }
                }
            }

            if (isFlat()) {
                gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
            } else if (isFlatGlobal()) {
                gpuDynInst->staticInstruction()->executed_as =
                    enums::SC_GLOBAL;
            } else {
                assert(isFlatScratch());
                gpuDynInst->staticInstruction()->executed_as =
                    enums::SC_PRIVATE;
                gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
            }
        }

        void
        issueRequestHelper(GPUDynInstPtr gpuDynInst)
        {
            if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
                || isFlatGlobal()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .issueRequest(gpuDynInst);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                assert(isFlat());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .issueRequest(gpuDynInst);
            } else {
                assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .issueRequest(gpuDynInst);
            }
        }

        // Execution of atomics is identical aside from the flag set in the
        // constructor, except for cmpswap. For cmpswap, the offset to the
        // "cmp" register is needed. For all other operations this offset is
        // zero and implies the atomic is not a cmpswap.
        // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
        // LaneT defines the type of the register elements (e.g., VecElemU32)
        template<typename RegT, typename LaneT, int CmpRegOffset = 0>
        void
        atomicExecute(GPUDynInstPtr gpuDynInst)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            if (gpuDynInst->exec_mask.none()) {
                wf->decVMemInstsIssued();
                if (isFlat()) {
                    wf->decLGKMInstsIssued();
                }
                return;
            }

            gpuDynInst->execUnitId = wf->execUnitId;
            gpuDynInst->latency.init(gpuDynInst->computeUnit());
            gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

            RegT data(gpuDynInst, extData.DATA);
            RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);

            data.read();
            if constexpr (CmpRegOffset) {
                cmp.read();
            }

            calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    if constexpr (CmpRegOffset) {
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->x_data))[lane] = data[lane];
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->a_data))[lane] = cmp[lane];
                    } else {
                        (reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
                            = data[lane];
                    }
                }
            }

            issueRequestHelper(gpuDynInst);
        }
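
        // A sketch of how implementations are expected to instantiate this
        // helper (operand types are illustrative): a non-cmpswap atomic add
        // would use atomicExecute<ConstVecOperandU32, VecElemU32>(...),
        // while a cmpswap passes a non-zero CmpRegOffset so that the
        // register at DATA + CmpRegOffset is read as the "cmp" value, e.g.
        // atomicExecute<ConstVecOperandU32, VecElemU32, 1>(gpuDynInst).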

        // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
        // LaneT defines the type of the register elements (e.g., VecElemU32)
        template<typename RegT, typename LaneT>
        void
        atomicComplete(GPUDynInstPtr gpuDynInst)
        {
            if (isAtomicRet()) {
                RegT vdst(gpuDynInst, extData.VDST);

                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                    if (gpuDynInst->exec_mask[lane]) {
                        vdst[lane] = (reinterpret_cast<LaneT*>(
                            gpuDynInst->d_data))[lane];
                    }
                }

                vdst.write();
            }
        }

        bool
        vgprIsOffset()
        {
            return (extData.SADDR != 0x7f);
        }

        // first instruction DWORD
        InFmt_FLAT instData;
        // second instruction DWORD
        InFmt_FLAT_1 extData;

      private:
        void initFlatOperandInfo();
        void initGlobalScratchOperandInfo();

        void generateFlatDisassembly();
        void generateGlobalScratchDisassembly();

        void
        calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,
                     ConstScalarOperandU64 &saddr, ScalarRegI32 offset)
        {
            // Use SGPR pair as a base address and add VGPR-offset and
            // instruction offset. The VGPR-offset is always 32-bits so we
            // mask any upper bits from the vaddr.
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    ScalarRegI32 voffset = vaddr[lane];
                    gpuDynInst->addr.at(lane) =
                        saddr.rawData() + voffset + offset;
                }
            }
        }

        void
        calcAddrVgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr,
                     ScalarRegI32 offset)
        {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane] + offset;
                }
            }
        }

        VecElemU32
        swizzle(VecElemU32 offset, int lane, int elem_size)
        {
            // This is not described in the spec. We use the swizzle from
            // buffer memory instructions and fix the stride to 4. Multiply
            // the thread ID by the storage size to avoid threads clobbering
            // their data.
            return ((offset / 4) * 4 * 64)
                + (offset % 4) + (lane * elem_size);
        }
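
        // e.g., with elem_size = 4, offset = 5, lane = 2:
        // ((5/4) * 4 * 64) + (5 % 4) + (2 * 4) = 256 + 1 + 8 = 265, so each
        // 4-byte chunk of a lane's data lands in a per-lane slot inside a
        // 256-byte (4 * 64) stripe.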

        Addr
        readFlatScratch(GPUDynInstPtr gpuDynInst)
        {
            return gpuDynInst->computeUnit()->shader->getScratchBase();
        }
    }; // Inst_FLAT
} // namespace VegaISA
} // namespace gem5

#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__