Files
gem5/src/arch/amdgpu/vega/insts/op_encodings.hh
Matthew Poremba c1803eafac arch-vega: Architected flat scratch and scratch insts
Architected flat scratch is added in MI300 which stores the scratch base
address in dedicated registers rather than in SGPRs. These registers are
used by scratch_ instructions. These are flat instructions which
explicitly target the private memory aperture. These instructions have a
different address calculation than global_ instructions.

This change implements architected flat scratch support, fixes the
address calculation of scratch_ instructions, and implements decodings
for some scratch_ instructions. Previous flat_ instructions which happen
to access the private memory aperture have no change in address
calculation. Since scratch_ instructions are identical to flat_
instructions except for address calculation, the decodings simply reuse
existing flat_ instruction definitions.

Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
2024-05-16 09:23:03 -07:00

1497 lines
53 KiB
C++

/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/gpu_mem_helpers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/operand.hh"
#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
namespace VegaISA
{
/**
 * V# buffer resource descriptor. The bitfield layout must match the
 * hardware encoding exactly: instructions memcpy the raw 128-bit SGPR
 * contents of the descriptor directly onto this struct before use.
 */
struct BufferRsrcDescriptor
{
    uint64_t baseAddr : 48;     // base address of the buffer
    uint32_t stride : 14;       // stride, in bytes, between records
    uint32_t cacheSwizzle : 1;
    uint32_t swizzleEn : 1;     // swizzled (vs. linear) addressing enable
    uint32_t numRecords : 32;   // buffer size; used for OOB clamping
    uint32_t dstSelX : 3;
    uint32_t dstSelY : 3;
    uint32_t dstSelZ : 3;
    uint32_t dstSelW : 3;
    uint32_t numFmt : 3;
    uint32_t dataFmt : 4;
    uint32_t elemSize : 2;      // element size in bytes = 2 << elemSize
    uint32_t idxStride : 2;     // index stride = 8 << idxStride
    uint32_t addTidEn : 1;      // add lane (thread) ID to the buffer index
    uint32_t atc : 1;
    uint32_t hashEn : 1;
    uint32_t heap : 1;
    uint32_t mType : 3;
    uint32_t type : 2;
};
// --- purely virtual instruction classes ---
/**
 * Base class for instructions in the SOP2 encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_SOP2 : public VEGAGPUStaticInst
{
  public:
    Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP2 instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second (literal) DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD (e.g. a literal)
    bool hasSecondDword(InFmt_SOP2 *);
}; // Inst_SOP2
/**
 * Base class for instructions in the SOPK encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_SOPK : public VEGAGPUStaticInst
{
  public:
    Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
    ~Inst_SOPK();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPK instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second (literal) DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD (e.g. a literal)
    bool hasSecondDword(InFmt_SOPK *);
}; // Inst_SOPK
/**
 * Base class for instructions in the SOP1 encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_SOP1 : public VEGAGPUStaticInst
{
  public:
    Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
    ~Inst_SOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP1 instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second (literal) DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD (e.g. a literal)
    bool hasSecondDword(InFmt_SOP1 *);
}; // Inst_SOP1
/**
 * Base class for instructions in the SOPC encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_SOPC : public VEGAGPUStaticInst
{
  public:
    Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
    ~Inst_SOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPC instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second (literal) DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD (e.g. a literal)
    bool hasSecondDword(InFmt_SOPC *);
}; // Inst_SOPC
/**
 * Base class for instructions in the SOPP encoding. SOPP is always a
 * single DWORD, so there is no extData/varSize here.
 */
class Inst_SOPP : public VEGAGPUStaticInst
{
  public:
    Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
    ~Inst_SOPP();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPP instData;
}; // Inst_SOPP
/**
 * Base class for instructions in the SMEM (scalar memory) encoding.
 * Provides helpers to issue scalar reads/writes and to compute the
 * scalar address for both plain and buffer forms.
 */
class Inst_SMEM : public VEGAGPUStaticInst
{
  public:
    Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
    ~Inst_SMEM();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    /**
     * initiate a memory read access for N dwords
     */
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::ReadReq);
    }

    /**
     * initiate a memory write access for N dwords
     */
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::WriteReq);
    }

    /**
     * For normal s_load_dword/s_store_dword instruction addresses.
     * Address is SGPR base + byte offset, forced to dword alignment
     * by clearing the low two bits.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
             ScalarRegU32 offset)
    {
        Addr vaddr = ((addr.rawData() + offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    /**
     * For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
     * The s_buffer instructions use the same buffer resource descriptor
     * as the MUBUF instructions.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst,
             ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
    {
        BufferRsrcDescriptor rsrc_desc;
        ScalarRegU32 clamped_offset(offset);
        // Reinterpret the raw 128b SGPR group as a V# descriptor.
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        /**
         * The address is clamped if:
         *     Stride is zero: clamp if offset >= num_records
         *     Stride is non-zero: clamp if offset > (stride * num_records)
         */
        if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
            clamped_offset = rsrc_desc.numRecords;
        } else if (rsrc_desc.stride && offset
                   > (rsrc_desc.stride * rsrc_desc.numRecords)) {
            clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
        }

        // Final scalar address is dword aligned.
        Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    // first instruction DWORD
    InFmt_SMEM instData;
    // second instruction DWORD
    InFmt_SMEM_1 extData;
}; // Inst_SMEM
/**
 * Base class for instructions in the VOP2 encoding: vector ALU ops with
 * two sources and a VGPR destination. Only SRC0 may index scalar
 * operands/constants; VSRC1 is always a VGPR. Also provides helpers
 * for the SDWA and DPP sub-encodings and a generic execute skeleton
 * (vop2Helper) shared by derived opcodes.
 */
class Inst_VOP2 : public VEGAGPUStaticInst
{
  public:
    Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
    ~Inst_VOP2();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP2 instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second DWORD (SDWA/DPP/literal)
    uint32_t varSize;

    /**
     * Read SRC0 under the SDWA sub-encoding and apply the SDWA source
     * modifiers to it and to src1 (modified in place). Returns the
     * selected/modified src0 to be fed to the op implementation.
     */
    template<typename T>
    T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
    {
        T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
        // use copies of original src0, src1, and dest during selecting
        T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
        T origSrc1(gpuDynInst, instData.VSRC1);

        src0_sdwa.read();
        origSrc0_sdwa.read();
        origSrc1.read();

        DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
            "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
            "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
            "SRC1_NEG: %d, SRC1_ABS: %d\n",
            opcode().c_str(), extData.iFmt_VOP_SDWA.SRC0,
            extData.iFmt_VOP_SDWA.DST_SEL, extData.iFmt_VOP_SDWA.DST_U,
            extData.iFmt_VOP_SDWA.CLMP, extData.iFmt_VOP_SDWA.SRC0_SEL,
            extData.iFmt_VOP_SDWA.SRC0_SEXT,
            extData.iFmt_VOP_SDWA.SRC0_NEG, extData.iFmt_VOP_SDWA.SRC0_ABS,
            extData.iFmt_VOP_SDWA.SRC1_SEL,
            extData.iFmt_VOP_SDWA.SRC1_SEXT,
            extData.iFmt_VOP_SDWA.SRC1_NEG,
            extData.iFmt_VOP_SDWA.SRC1_ABS);

        processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                        src1, origSrc1);

        return src0_sdwa;
    }

    /**
     * Apply the SDWA destination modifiers to vdst after the op has
     * executed, preserving unselected destination bits from the
     * pre-execution value of VDST.
     */
    template<typename T>
    void sdwaDstHelper(GPUDynInstPtr gpuDynInst, T & vdst)
    {
        T origVdst(gpuDynInst, instData.VDST);

        Wavefront *wf = gpuDynInst->wavefront();
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                origVdst[lane] = vdst[lane]; // keep copy consistent
            }
        }

        processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
    }

    /**
     * Read SRC0 under the DPP sub-encoding and apply the DPP lane
     * permutation/modifiers. Returns the permuted src0.
     */
    template<typename T>
    T dppHelper(GPUDynInstPtr gpuDynInst, T & src1)
    {
        T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
        src0_dpp.read();

        // Note: %#x already prints a "0x" prefix, so the format string
        // must not add its own.
        DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
            "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
            "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
            opcode().c_str(), extData.iFmt_VOP_DPP.SRC0,
            extData.iFmt_VOP_DPP.DPP_CTRL, extData.iFmt_VOP_DPP.SRC0_ABS,
            extData.iFmt_VOP_DPP.SRC0_NEG, extData.iFmt_VOP_DPP.SRC1_ABS,
            extData.iFmt_VOP_DPP.SRC1_NEG, extData.iFmt_VOP_DPP.BC,
            extData.iFmt_VOP_DPP.BANK_MASK, extData.iFmt_VOP_DPP.ROW_MASK);

        processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

        return src0_dpp;
    }

    /**
     * Generic VOP2 execute skeleton: reads sources, dispatches to the
     * plain/SDWA/DPP variant, runs fOpImpl per lane, and writes VDST.
     */
    template<typename ConstT, typename T>
    void vop2Helper(GPUDynInstPtr gpuDynInst,
                    void (*fOpImpl)(T&, T&, T&, Wavefront*))
    {
        Wavefront *wf = gpuDynInst->wavefront();
        T src0(gpuDynInst, instData.SRC0);
        T src1(gpuDynInst, instData.VSRC1);
        T vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            T src0_sdwa = sdwaSrcHelper(gpuDynInst, src1);
            fOpImpl(src0_sdwa, src1, vdst, wf);
            sdwaDstHelper(gpuDynInst, vdst);
        } else if (isDPPInst()) {
            T src0_dpp = dppHelper(gpuDynInst, src1);
            fOpImpl(src0_dpp, src1, vdst, wf);
        } else {
            // src0 is unmodified. We need to use the const container
            // type to allow reading scalar operands from src0. Only
            // src0 can index scalar operands. We copy this to vdst
            // temporarily to pass to the lambda so the instruction
            // does not need to write two lambda functions (one for
            // a const src0 and one of a mutable src0).
            ConstT const_src0(gpuDynInst, instData.SRC0);
            const_src0.readSrc();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                vdst[lane] = const_src0[lane];
            }
            fOpImpl(vdst, src1, vdst, wf);
        }

        vdst.write();
    }

  private:
    // true if the instruction carries a second DWORD
    bool hasSecondDword(InFmt_VOP2 *);
}; // Inst_VOP2
/**
 * Base class for instructions in the VOP1 encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_VOP1 : public VEGAGPUStaticInst
{
  public:
    Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
    ~Inst_VOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP1 instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD
    bool hasSecondDword(InFmt_VOP1 *);
}; // Inst_VOP1
/**
 * Base class for instructions in the VOPC encoding. Holds the decoded
 * instruction words; per-opcode behavior lives in derived classes.
 */
class Inst_VOPC : public VEGAGPUStaticInst
{
  public:
    Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
    ~Inst_VOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOPC instData;
    // possible second DWORD
    InstFormat extData;
    // size of the instruction in bytes; varies with the presence of a
    // second DWORD
    uint32_t varSize;

  private:
    // true if the instruction carries a second DWORD
    bool hasSecondDword(InFmt_VOPC *);
}; // Inst_VOPC
/**
 * Base class for instructions in the VINTRP encoding. Single-DWORD
 * format; note this class does not override generateDisassembly or
 * initOperandInfo.
 */
class Inst_VINTRP : public VEGAGPUStaticInst
{
  public:
    Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
    ~Inst_VINTRP();

    int instSize() const override;

  protected:
    // first instruction DWORD
    InFmt_VINTRP instData;
}; // Inst_VINTRP
/**
 * Base class for instructions in the VOP3A encoding (two-DWORD vector
 * ALU format). See sgprDst below for the special-cased instructions
 * whose VDST field names a scalar register.
 */
class Inst_VOP3A : public VEGAGPUStaticInst
{
  public:
    Inst_VOP3A(InFmt_VOP3A*, const std::string &opcode, bool sgpr_dst);
    ~Inst_VOP3A();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3A instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3A *);
    /**
     * the v_cmp and readlane instructions in the VOP3
     * encoding are unique because they are the only
     * instructions that use the VDST field to specify
     * a scalar register destination. for VOP3::V_CMP insts
     * VDST specifies the arbitrary SGPR pair used to write
     * VCC. for V_READLANE VDST specifies the SGPR to return
     * the value of the selected lane in the source VGPR
     * from which we are reading.
     */
    const bool sgprDst;
}; // Inst_VOP3A
/**
 * Base class for instructions in the VOP3B encoding (two-DWORD vector
 * ALU format with an explicit scalar carry/condition destination).
 */
class Inst_VOP3B : public VEGAGPUStaticInst
{
  public:
    Inst_VOP3B(InFmt_VOP3B*, const std::string &opcode);
    ~Inst_VOP3B();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3B instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3B *);
}; // Inst_VOP3B
/**
 * Base class for instructions in the VOP3P (packed math) encoding.
 * Each 32b VGPR holds two packed 16b values; the helpers below run the
 * op implementation once for the low halves and once for the high
 * halves, selected by the OPSEL/OPSEL_HI fields, then repack the two
 * 16b results into the destination dword.
 */
class Inst_VOP3P : public VEGAGPUStaticInst
{
  public:
    Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
    ~Inst_VOP3P();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3P instData;
    // second instruction DWORD
    InFmt_VOP3P_1 extData;

    /**
     * Packed-math helper for two-source ops. fOpImpl computes one 16b
     * result; it is invoked twice per lane (hi and lo words).
     */
    template<typename T>
    void vop3pHelper(GPUDynInstPtr gpuDynInst,
                     T (*fOpImpl)(T, T, bool))
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
        VecOperandU32 D(gpuDynInst, instData.VDST);

        S0.readSrc();
        S1.readSrc();

        // Word-select and negate controls for the lo and hi halves.
        int opLo = instData.OPSEL;
        int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
        int negLo = extData.NEG;
        int negHi = instData.NEG_HI;
        bool clamp = instData.CLMP;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                                      word<T>(S1[lane], opHi, negHi, 1),
                                      clamp);
                T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                                      word<T>(S1[lane], opLo, negLo, 1),
                                      clamp);

                // Repack the two 16b results into one dword.
                uint16_t upper_raw =
                    *reinterpret_cast<uint16_t*>(&upper_val);
                uint16_t lower_raw =
                    *reinterpret_cast<uint16_t*>(&lower_val);

                D[lane] = upper_raw << 16 | lower_raw;
            }
        }

        D.write();
    }

    /**
     * Packed-math helper for three-source ops (e.g. fused ops taking
     * SRC2). Same structure as the two-source variant above.
     */
    template<typename T>
    void vop3pHelper(GPUDynInstPtr gpuDynInst,
                     T (*fOpImpl)(T, T, T, bool))
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
        VecOperandU32 D(gpuDynInst, instData.VDST);

        S0.readSrc();
        S1.readSrc();
        S2.readSrc();

        int opLo = instData.OPSEL;
        int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
        int negLo = extData.NEG;
        int negHi = instData.NEG_HI;
        bool clamp = instData.CLMP;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                                      word<T>(S1[lane], opHi, negHi, 1),
                                      word<T>(S2[lane], opHi, negHi, 2),
                                      clamp);
                T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                                      word<T>(S1[lane], opLo, negLo, 1),
                                      word<T>(S2[lane], opLo, negLo, 2),
                                      clamp);

                uint16_t upper_raw =
                    *reinterpret_cast<uint16_t*>(&upper_val);
                uint16_t lower_raw =
                    *reinterpret_cast<uint16_t*>(&lower_val);

                D[lane] = upper_raw << 16 | lower_raw;
            }
        }

        D.write();
    }

    /**
     * Helper for dot-product style ops: builds the two source dwords
     * from the OPSEL-selected words and hands them, plus the raw SRC2
     * dword, to fOpImpl.
     */
    void
    dotHelper(GPUDynInstPtr gpuDynInst,
              uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
        VecOperandU32 D(gpuDynInst, instData.VDST);

        S0.readSrc();
        S1.readSrc();
        S2.readSrc();

        // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
        // dword1[15:0] is upper/lower 16b of src0 based on opsel[0]
        // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
        // dword2[15:0] is upper/lower 16b of src1 based on opsel[1]
        // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
        int opLo = instData.OPSEL;
        int opHi = extData.OPSEL_HI;
        int negLo = extData.NEG;
        int negHi = instData.NEG_HI;
        bool clamp = instData.CLMP;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                uint32_t dword1l =
                    word<uint16_t>(S0[lane], opLo, negLo, 0);
                uint32_t dword1h =
                    word<uint16_t>(S0[lane], opHi, negHi, 0);
                uint32_t dword2l =
                    word<uint16_t>(S1[lane], opLo, negLo, 1);
                uint32_t dword2h =
                    word<uint16_t>(S1[lane], opHi, negHi, 1);
                uint32_t dword1 = (dword1h << 16) | dword1l;
                uint32_t dword2 = (dword2h << 16) | dword2l;

                // Take in two uint32_t dwords and one src2 dword. The
                // function will need to call bits to break up to the
                // correct size and then reinterpret cast to the correct
                // value.
                D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
            }
        }

        D.write();
    }

  private:
    bool hasSecondDword(InFmt_VOP3P *);

    /**
     * Extract the opSelBit-selected 16b word (hi or lo) of a packed
     * dword and apply the NEG input modifier for that position.
     */
    template<typename T>
    T
    word(uint32_t data, int opSel, int neg, int opSelBit)
    {
        // This method assumes two words packed into a dword
        static_assert(sizeof(T) == 2);
        bool select = bits(opSel, opSelBit, opSelBit);
        uint16_t raw = select ? bits(data, 31, 16)
                              : bits(data, 15, 0);

        // Apply input modifiers. This may seem odd, but the hardware
        // just flips the MSb instead of doing unary negation.
        bool negate = bits(neg, opSelBit, opSelBit);
        if (negate) {
            raw ^= 0x8000;
        }

        return *reinterpret_cast<T*>(&raw);
    }
}; // Inst_VOP3P
/**
 * Base class for instructions in the VOP3P-MAI (matrix) encoding.
 * Holds the decoded instruction words; per-opcode behavior lives in
 * derived classes.
 */
class Inst_VOP3P_MAI : public VEGAGPUStaticInst
{
  public:
    Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
    ~Inst_VOP3P_MAI();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3P_MAI instData;
    // second instruction DWORD
    InFmt_VOP3P_MAI_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3P_MAI *);
}; // Inst_VOP3P_MAI
/**
 * Base class for instructions in the DS (data share / LDS) encoding.
 * The helpers below access the wavefront's LDS chunk directly per
 * lane rather than issuing memory requests.
 */
class Inst_DS : public VEGAGPUStaticInst
{
  public:
    Inst_DS(InFmt_DS*, const std::string &opcode);
    ~Inst_DS();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // Read one T-sized value per active lane from LDS into d_data.
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                    = wf->ldsChunk->read<T>(vaddr);
            }
        }
    }

    // Read N consecutive dwords per active lane from LDS into d_data.
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    (reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N + i]
                        = wf->ldsChunk->read<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32));
                }
            }
        }
    }

    // Two independent reads per lane (e.g. ds_read2_*); results are
    // interleaved in d_data as [lane*2] and [lane*2 + 1].
    template<typename T>
    void
    initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                    = wf->ldsChunk->read<T>(vaddr0);
                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                    = wf->ldsChunk->read<T>(vaddr1);
            }
        }
    }

    // Write one T-sized value per active lane from d_data to LDS.
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                wf->ldsChunk->write<T>(vaddr,
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            }
        }
    }

    // Write N consecutive dwords per active lane from d_data to LDS.
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    wf->ldsChunk->write<VecElemU32>(
                        vaddr + i*sizeof(VecElemU32),
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]);
                }
            }
        }
    }

    // Two independent writes per lane (e.g. ds_write2_*); sources are
    // interleaved in d_data as [lane*2] and [lane*2 + 1].
    template<typename T>
    void
    initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2]);
                wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2 + 1]);
            }
        }
    }

    // Per-lane atomic on LDS: the functor combines a_data (and x_data
    // for compare-swap style ops); the pre-op value is returned in
    // d_data.
    template<typename T>
    void
    initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;

                AtomicOpFunctorPtr amo_op =
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                    = wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
            }
        }
    }

    // Copy the per-lane VGPR addresses into the dyn inst's addr array.
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
            }
        }
    }

    // first instruction DWORD
    InFmt_DS instData;
    // second instruction DWORD
    InFmt_DS_1 extData;
}; // Inst_DS
/**
 * Base class for instructions in the MUBUF (untyped buffer) encoding.
 * Memory helpers mask out lanes that calcAddr() flagged as
 * out-of-bounds (oobMask) before issuing requests.
 */
class Inst_MUBUF : public VEGAGPUStaticInst
{
  public:
    Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
    ~Inst_MUBUF();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to suppress memory accesses to oob
        // regions. Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to suppress memory accesses to oob
        // regions. Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to suppress memory accesses to oob
        // regions. Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to suppress memory accesses to oob
        // regions. Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    template<typename T>
    void
    initAtomicAccess(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to suppress memory accesses to oob
        // regions. Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // Issue a system-wide memory fence request through the CU.
    void
    injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
    {
        // create request and set flags
        gpuDynInst->resetEntireStatusVector();
        gpuDynInst->setStatusVector(0, 1);
        RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                                   gpuDynInst->computeUnit()->
                                                   requestorId(), 0,
                                                   gpuDynInst->wfDynId);
        gpuDynInst->setRequestFlags(req);
        gpuDynInst->computeUnit()->
            injectGlobalMemFence(gpuDynInst, false, req);
    }

    /**
     * MUBUF instructions calculate their addresses as follows:
     *
     *     index  = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
     *     offset = (OFFEN ? vgpr_off : 0) + inst_off
     *
     *     / ====================== LINEAR ADDRESSING ====================== /
     *     VADDR = base + sgpr_off + offset + stride * index
     *
     *     / ===================== SWIZZLED ADDRESSING ===================== /
     *     index_msb  = index / const_index_stride
     *     index_lsb  = index % const_index_stride
     *     offset_msb = offset / const_element_size
     *     offset_lsb = offset % const_element_size
     *     buffer_offset = ((index_msb * stride + offset_msb *
     *                      const_element_size) * const_index_stride +
     *                      index_lsb * const_element_size + offset_lsb)
     *
     *     VADDR = base + sgpr_off + buffer_offset
     */
    template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
    void
    calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
             SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
    {
        Addr vaddr = 0;
        Addr base_addr = 0;
        Addr stride = 0;
        Addr buf_idx = 0;
        Addr buf_off = 0;
        Addr buffer_offset = 0;
        BufferRsrcDescriptor rsrc_desc;

        // Reinterpret the raw 128b SGPR group as a V# descriptor.
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        base_addr = rsrc_desc.baseAddr;

        stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                                       + rsrc_desc.stride)
                                    : rsrc_desc.stride;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                vaddr = base_addr + s_offset.rawData();
                /**
                 * first we calculate the buffer's index and offset.
                 * these will be used for either linear or swizzled
                 * buffers.
                 */
                buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                buf_off = v_off[lane] + inst_offset;

                if (rsrc_desc.swizzleEn) {
                    Addr idx_stride = 8 << rsrc_desc.idxStride;
                    Addr elem_size = 2 << rsrc_desc.elemSize;
                    Addr idx_msb = buf_idx / idx_stride;
                    Addr idx_lsb = buf_idx % idx_stride;
                    Addr off_msb = buf_off / elem_size;
                    Addr off_lsb = buf_off % elem_size;
                    DPRINTF(VEGA, "mubuf swizzled lane %d: "
                            "idx_stride = %llx, elem_size = %llx, "
                            "idx_msb = %llx, idx_lsb = %llx, "
                            "off_msb = %llx, off_lsb = %llx\n",
                            lane, idx_stride, elem_size, idx_msb, idx_lsb,
                            off_msb, off_lsb);

                    buffer_offset = (idx_msb * stride + off_msb * elem_size)
                        * idx_stride + idx_lsb * elem_size + off_lsb;
                } else {
                    buffer_offset = buf_off + stride * buf_idx;
                }

                /**
                 * Range check behavior causes out of range accesses to
                 * to be treated differently. Out of range accesses return
                 * 0 for loads and are ignored for stores. For
                 * non-formatted accesses, this is done on a per-lane
                 * basis.
                 */
                if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                    if (buffer_offset >=
                        rsrc_desc.numRecords - s_offset.rawData()) {
                        DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
                                "lane = %d, buffer_offset = %llx, "
                                "const_stride = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off + stride * buf_idx,
                                stride, rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                    if (buf_idx >= rsrc_desc.numRecords ||
                        buf_off >= stride) {
                        DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
                                "lane = %d, offset = %llx, "
                                "index = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off, buf_idx,
                                rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                vaddr += buffer_offset;

                DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
                        "vaddr = %llx, base_addr = %llx, "
                        "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                        lane, vaddr, base_addr, stride,
                        buf_idx, buf_off);
                gpuDynInst->addr.at(lane) = vaddr;
            }
        }
    }

    // first instruction DWORD
    InFmt_MUBUF instData;
    // second instruction DWORD
    InFmt_MUBUF_1 extData;
    // Mask of lanes with out-of-bounds accesses. Needs to be tracked
    // separately from the exec_mask so that we remember to write zero
    // to the registers associated with out of bounds lanes.
    VectorMask oobMask;
}; // Inst_MUBUF
/**
 * Base class for instructions in the MTBUF (typed buffer) encoding.
 * Holds the decoded instruction words; per-opcode behavior lives in
 * derived classes.
 */
class Inst_MTBUF : public VEGAGPUStaticInst
{
  public:
    Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
    ~Inst_MTBUF();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MTBUF instData;
    // second instruction DWORD
    InFmt_MTBUF_1 extData;

  private:
    bool hasSecondDword(InFmt_MTBUF *);
}; // Inst_MTBUF
/**
 * Base class for instructions in the MIMG (image) encoding. Holds the
 * decoded instruction words; per-opcode behavior lives in derived
 * classes.
 */
class Inst_MIMG : public VEGAGPUStaticInst
{
  public:
    Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
    ~Inst_MIMG();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MIMG instData;
    // second instruction DWORD
    InFmt_MIMG_1 extData;
}; // Inst_MIMG
/**
 * Base class for instructions in the EXP (export) encoding. Holds the
 * decoded instruction words; per-opcode behavior lives in derived
 * classes.
 */
class Inst_EXP : public VEGAGPUStaticInst
{
  public:
    Inst_EXP(InFmt_EXP*, const std::string &opcode);
    ~Inst_EXP();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_EXP instData;
    // second instruction DWORD
    InFmt_EXP_1 extData;
}; // Inst_EXP
class Inst_FLAT : public VEGAGPUStaticInst
{
public:
Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
~Inst_FLAT();
int instSize() const override;
void generateDisassembly() override;
void initOperandInfo() override;
protected:
template<typename T>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
= wf->ldsChunk->read<T>(vaddr);
}
}
}
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]
= wf->ldsChunk->read<VecElemU32>(
vaddr + i*sizeof(VecElemU32));
}
}
}
}
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
wf->ldsChunk->write<T>(vaddr,
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
}
}
}
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
wf->ldsChunk->write<VecElemU32>(
vaddr + i*sizeof(VecElemU32),
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]);
}
}
}
}
}
    // Initiate an atomic access of type T. Global-segment atomics are
    // issued to memory as SwapReq packets (the actual operation is carried
    // by the instruction's atomic-op functor); group-segment (LDS) atomics
    // are performed immediately here as a read-modify-write on the
    // wavefront's LDS chunk.
    template<typename T>
    void
    initAtomicAccess(GPUDynInstPtr gpuDynInst)
    {
        // Flat scratch requests may not be atomic according to ISA manual
        // up to MI200. See MI200 manual Table 45.
        assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            // Last argument (true) marks the request as atomic.
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    // Build the per-lane operation functor from the
                    // operands staged by atomicExecute: a_data is the
                    // primary source operand, x_data the cmpswap "src".
                    auto amo_op =
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(
                                gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]);
                    // Read the current LDS value, apply the operation in
                    // place, and write the result back.
                    T tmp = wf->ldsChunk->read<T>(vaddr);
                    (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
                    wf->ldsChunk->write<T>(vaddr, tmp);
                    // NOTE(review): d_data receives the post-op value
                    // here, but returning atomics architecturally return
                    // the pre-op memory value — confirm this matches the
                    // functor/return semantics used by atomicComplete.
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
                }
            }
        }
    }
    // Compute the per-lane effective addresses for a FLAT, GLOBAL, or
    // SCRATCH instruction and record which memory segment the instruction
    // executes as. vaddr and saddr are *register indices* taken from the
    // instruction encoding; offset is the raw 13-bit immediate field.
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ScalarRegU32 vaddr,
             ScalarRegU32 saddr, ScalarRegI32 offset)
    {
        // Offset is a 13-bit field w/the following meanings:
        // In Flat instructions, offset is a 12-bit unsigned number
        // In Global/Scratch instructions, offset is a 13-bit signed number
        if (isFlat()) {
            offset = offset & 0xfff;
        } else {
            offset = (ScalarRegI32)sext<13>(offset);
        }
        // If saddr = 0x7f there is no scalar reg to read and address will
        // be a 64-bit address. Otherwise, saddr is the reg index for a
        // scalar reg used as the base address for a 32-bit address.
        if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
            // Full 64-bit address comes from a VGPR pair.
            ConstVecOperandU64 vbase(gpuDynInst, vaddr);
            vbase.read();
            calcAddrVgpr(gpuDynInst, vbase, offset);
        } else if (isFlatGlobal()) {
            // Assume we are operating in 64-bit mode and read a pair of
            // SGPRs for the address base.
            ConstScalarOperandU64 sbase(gpuDynInst, saddr);
            sbase.read();
            ConstVecOperandU32 voffset(gpuDynInst, vaddr);
            voffset.read();
            calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
        // For scratch, saddr = 0x7f there is no scalar reg to read and
        // a vgpr will be used for address offset. Otherwise, saddr is
        // the sgpr index holding the address offset. For scratch
        // instructions the offset GPR is always 32-bits.
        } else if (saddr != 0x7f) {
            assert(isFlatScratch());
            // Scratch with an SGPR offset: every lane shares the same
            // (soffset + imm) byte offset; per-lane separation comes
            // from the swizzle below.
            ConstScalarOperandU32 soffset(gpuDynInst, saddr);
            soffset.read();
            // Architected flat scratch: the base address is held by the
            // shader rather than in SGPRs.
            Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
            // Element size for the swizzle: size of the destination
            // operand for loads (operand 2), or of the source data
            // operand for stores (operand 1).
            int elemSize;
            auto staticInst = gpuDynInst->staticInstruction();
            if (gpuDynInst->isLoad()) {
                elemSize = staticInst->getOperandSize(2);
            } else {
                assert(gpuDynInst->isStore());
                elemSize = staticInst->getOperandSize(1);
            }
            unsigned swizzleOffset = soffset.rawData() + offset;
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = flat_scratch_addr
                        + swizzle(swizzleOffset, lane, elemSize);
                }
            }
        } else {
            assert(isFlatScratch());
            // Scratch with a VGPR offset: each lane supplies its own
            // 32-bit offset in vaddr.
            ConstVecOperandU32 voffset(gpuDynInst, vaddr);
            voffset.read();
            Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
            int elemSize;
            auto staticInst = gpuDynInst->staticInstruction();
            if (gpuDynInst->isLoad()) {
                elemSize = staticInst->getOperandSize(2);
            } else {
                assert(gpuDynInst->isStore());
                elemSize = staticInst->getOperandSize(1);
            }
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = flat_scratch_addr
                        + swizzle(voffset[lane] + offset, lane, elemSize);
                }
            }
        }
        // Record the segment: flat instructions resolve per lane (can be
        // global, group, or private); global_/scratch_ instructions
        // explicitly target a single segment.
        if (isFlat()) {
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
        } else if (isFlatGlobal()) {
            gpuDynInst->staticInstruction()->executed_as =
                enums::SC_GLOBAL;
        } else {
            assert(isFlatScratch());
            gpuDynInst->staticInstruction()->executed_as =
                enums::SC_PRIVATE;
            // NOTE(review): scratch also calls resolveFlatSegment after
            // forcing SC_PRIVATE — presumably for per-lane bookkeeping;
            // confirm against resolveFlatSegment's implementation.
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
        }
    }
void
issueRequestHelper(GPUDynInstPtr gpuDynInst)
{
if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
|| isFlatGlobal()) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
assert(isFlat());
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
}
}
// Execute for atomics is identical besides the flag set in the
// constructor, except cmpswap. For cmpswap, the offset to the "cmp"
// register is needed. For all other operations this offset is zero
// and implies the atomic is not a cmpswap.
// RegT defines the type of GPU register (e.g., ConstVecOperandU32)
// LaneT defines the type of the register elements (e.g., VecElemU32)
template<typename RegT, typename LaneT, int CmpRegOffset = 0>
void
atomicExecute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
RegT data(gpuDynInst, extData.DATA);
RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);
data.read();
if constexpr (CmpRegOffset) {
cmp.read();
}
calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
if constexpr (CmpRegOffset) {
(reinterpret_cast<VecElemU32*>(
gpuDynInst->x_data))[lane] = data[lane];
(reinterpret_cast<VecElemU32*>(
gpuDynInst->a_data))[lane] = cmp[lane];
} else {
(reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
= data[lane];
}
}
}
issueRequestHelper(gpuDynInst);
}
// RegT defines the type of GPU register (e.g., ConstVecOperandU32)
// LaneT defines the type of the register elements (e.g., VecElemU32)
template<typename RegT, typename LaneT>
void
atomicComplete(GPUDynInstPtr gpuDynInst)
{
if (isAtomicRet()) {
RegT vdst(gpuDynInst, extData.VDST);
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
vdst[lane] = (reinterpret_cast<LaneT*>(
gpuDynInst->d_data))[lane];
}
}
vdst.write();
}
}
bool
vgprIsOffset()
{
return (extData.SADDR != 0x7f);
}
    // first instruction DWORD
    InFmt_FLAT instData;
    // second instruction DWORD
    InFmt_FLAT_1 extData;
  private:
    // Operand-info and disassembly helpers, split by encoding family
    // (flat_ vs. global_/scratch_ instructions); defined out of line.
    void initFlatOperandInfo();
    void initGlobalScratchOperandInfo();
    void generateFlatDisassembly();
    void generateGlobalScratchDisassembly();
    // Compute per-lane addresses as SGPR-pair base + 32-bit VGPR offset
    // + instruction immediate offset.
    void
    calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,
                 ConstScalarOperandU64 &saddr, ScalarRegI32 offset)
    {
        // Use SGPR pair as a base address and add VGPR-offset and
        // instruction offset. The VGPR-offset is always 32-bits so we
        // mask any upper bits from the vaddr.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                // NOTE(review): the VGPR offset is held in a *signed*
                // 32-bit temp, so it is sign-extended when added to the
                // 64-bit base (effectively a +/-2GiB displacement) —
                // confirm this matches the intended ISA semantics.
                ScalarRegI32 voffset = vaddr[lane];
                gpuDynInst->addr.at(lane) =
                    saddr.rawData() + voffset + offset;
            }
        }
    }
void
calcAddrVgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr,
ScalarRegI32 offset)
{
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = addr[lane] + offset;
}
}
}
VecElemU32
swizzle(VecElemU32 offset, int lane, int elem_size)
{
// This is not described in the spec. We use the swizzle from
// buffer memory instructions and fix the stride to 4. Multiply
// the thread ID by the storage size to avoid threads clobbering
// their data.
return ((offset / 4) * 4 * 64)
+ (offset % 4) + (lane * elem_size);
}
Addr
readFlatScratch(GPUDynInstPtr gpuDynInst)
{
return gpuDynInst->computeUnit()->shader->getScratchBase();
}
}; // Inst_FLAT
} // namespace VegaISA
} // namespace gem5
#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__