arch-vega: Reorganize inst and misc files (#789)

This PR reorganizes instructions.cc into multiple files and renames
some files that do not match their corresponding header file names. The
intention is to make iterating on development of these files faster.
This commit is contained in:
Matthew Poremba
2024-01-23 10:06:40 -08:00
committed by GitHub
26 changed files with 47291 additions and 46544 deletions

View File

@@ -49,13 +49,32 @@ Source('tlb_coalescer.cc')
DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker')
if env['CONF']['TARGET_GPU_ISA'] == 'vega':
Source('decoder.cc')
Source('gpu_decoder.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/instructions.cc')
Source('insts/op_encodings.cc')
Source('isa.cc')
Source('registers.cc')
Source('gpu_isa.cc')
Source('gpu_registers.cc')
Source('insts/sop2.cc')
Source('insts/sopk.cc')
Source('insts/sop1.cc')
Source('insts/sopc.cc')
Source('insts/sopp.cc')
Source('insts/smem.cc')
Source('insts/vop2.cc')
Source('insts/vop1.cc')
Source('insts/vopc.cc')
Source('insts/vinterp.cc')
Source('insts/vop3.cc')
Source('insts/vop3_cmp.cc')
Source('insts/ds.cc')
Source('insts/mubuf.cc')
Source('insts/mtbuf.cc')
Source('insts/mimg.cc')
Source('insts/exp.cc')
Source('insts/flat.cc')
Source('insts/vop3p.cc')
Source('insts/vop3p_mai.cc')
DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')

View File

@@ -29,9 +29,10 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include <vector>
#include "arch/amdgpu/vega/gpu_decoder.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/amdgpu/vega/insts/vop3p.hh"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_EXP__EXP class methods ---
// EXP export instruction. Only construction/destruction are defined here;
// execute() is a placeholder that panics (no flags are set in the ctor).
Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
: Inst_EXP(iFmt, "exp")
{
} // Inst_EXP__EXP
Inst_EXP__EXP::~Inst_EXP__EXP()
{
} // ~Inst_EXP__EXP
// --- description from .arch file ---
// Export through SX.
void
Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst)
{
// Not implemented in the timing model yet; any program decoding this
// opcode will terminate the simulation with a panic.
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@
#include <cmath>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
namespace gem5
{
@@ -315,7 +316,8 @@ namespace VegaISA
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
inline int
dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
int rowOffset, bool & outOfBounds)
{
// local variables

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,584 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// MTBUF (typed buffer) load/store instructions. Every opcode below is a
// placeholder: execute() panics via panicUnimplemented(), and the
// initiateAcc()/completeAcc() memory-pipeline hooks are empty stubs.
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_x")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
// --- description from .arch file ---
// Typed buffer load 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xy")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
// --- description from .arch file ---
// Typed buffer load 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xyz")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
// --- description from .arch file ---
// Typed buffer load 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
// --- description from .arch file ---
// Typed buffer load 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_X
::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_x")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_X
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X
// --- description from .arch file ---
// Typed buffer store 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xy")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
// --- description from .arch file ---
// Typed buffer store 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xyz")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
// --- description from .arch file ---
// Typed buffer store 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
// --- description from .arch file ---
// Typed buffer store 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
// --- description from .arch file ---
// Typed buffer load 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
// --- description from .arch file ---
// Typed buffer load 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(
InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
// --- description from .arch file ---
// Typed buffer load 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods ---
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(
InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw")
{
setFlag(MemoryRef);
setFlag(Load);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
// --- description from .arch file ---
// Typed buffer load 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
// --- description from .arch file ---
// Typed buffer store 1 dword with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
// --- description from .arch file ---
// Typed buffer store 2 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
// --- description from .arch file ---
// Typed buffer store 3 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods ---
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
: Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
{
setFlag(MemoryRef);
setFlag(Store);
setFlag(GlobalSegment);
} // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
{
} // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
// --- description from .arch file ---
// Typed buffer store 4 dwords with format conversion.
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
GPUDynInstPtr gpuDynInst)
{
} // initiateAcc
void
Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
GPUDynInstPtr gpuDynInst)
{
} // completeAcc
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,599 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_SOPC__S_CMP_EQ_I32 class methods ---
Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_eq_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_EQ_I32
Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32()
{
} // ~Inst_SOPC__S_CMP_EQ_I32
// --- description from .arch file ---
// SCC = (S0.i == S1.i).
void
Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LG_I32 class methods ---
Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lg_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LG_I32
Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32()
{
} // ~Inst_SOPC__S_CMP_LG_I32
// --- description from .arch file ---
// SCC = (S0.i != S1.i).
void
Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GT_I32 class methods ---
Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_gt_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GT_I32
Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32()
{
} // ~Inst_SOPC__S_CMP_GT_I32
// --- description from .arch file ---
// SCC = (S0.i > S1.i).
void
Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() > src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GE_I32 class methods ---
Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_ge_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GE_I32
Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32()
{
} // ~Inst_SOPC__S_CMP_GE_I32
// --- description from .arch file ---
// SCC = (S0.i >= S1.i).
void
Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() >= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LT_I32 class methods ---
Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lt_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LT_I32
Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32()
{
} // ~Inst_SOPC__S_CMP_LT_I32
// --- description from .arch file ---
// SCC = (S0.i < S1.i).
void
Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() < src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LE_I32 class methods ---
Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_le_i32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LE_I32
Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32()
{
} // ~Inst_SOPC__S_CMP_LE_I32
// --- description from .arch file ---
// SCC = (S0.i <= S1.i).
void
Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() <= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_EQ_U32 class methods ---
Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_eq_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_EQ_U32
Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32()
{
} // ~Inst_SOPC__S_CMP_EQ_U32
// --- description from .arch file ---
// SCC = (S0.u == S1.u).
void
Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LG_U32 class methods ---
Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lg_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LG_U32
Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32()
{
} // ~Inst_SOPC__S_CMP_LG_U32
// --- description from .arch file ---
// SCC = (S0.u != S1.u).
void
Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GT_U32 class methods ---
Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_gt_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GT_U32
Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32()
{
} // ~Inst_SOPC__S_CMP_GT_U32
// --- description from .arch file ---
// SCC = (S0.u > S1.u).
void
Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() > src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_GE_U32 class methods ---
Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_ge_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_GE_U32
Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32()
{
} // ~Inst_SOPC__S_CMP_GE_U32
// --- description from .arch file ---
// SCC = (S0.u >= S1.u).
void
Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() >= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LT_U32 class methods ---
Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_lt_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LT_U32
Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32()
{
} // ~Inst_SOPC__S_CMP_LT_U32
// --- description from .arch file ---
// SCC = (S0.u < S1.u).
void
Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() < src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_CMP_LE_U32 class methods ---
Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_cmp_le_u32")
{
setFlag(ALU);
} // Inst_SOPC__S_CMP_LE_U32
Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32()
{
} // ~Inst_SOPC__S_CMP_LE_U32
// --- description from .arch file ---
// SCC = (S0.u <= S1.u).
void
Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = (src0.rawData() <= src1.rawData()) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP0_B32 class methods ---
Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp0_b32")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP0_B32
Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32()
{
} // ~Inst_SOPC__S_BITCMP0_B32
// --- description from .arch file ---
// SCC = (S0.u[S1.u[4:0]] == 0).
void
Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP1_B32 class methods ---
Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp1_b32")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP1_B32
Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32()
{
} // ~Inst_SOPC__S_BITCMP1_B32
// --- description from .arch file ---
// SCC = (S0.u[S1.u[4:0]] == 1).
void
Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;
scc.write();
} // execute
// --- Inst_SOPC__S_BITCMP0_B64 class methods ---
Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt)
: Inst_SOPC(iFmt, "s_bitcmp0_b64")
{
setFlag(ALU);
} // Inst_SOPC__S_BITCMP0_B64
Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64()
{
} // ~Inst_SOPC__S_BITCMP0_B64
// --- description from .arch file ---
// SCC = (S0.u64[S1.u[5:0]] == 0).
void
Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst)
{
ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src0.read();
src1.read();
scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPC__S_BITCMP1_B64 class methods ---

    Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B64

    Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64()
    {
    } // ~Inst_SOPC__S_BITCMP1_B64

    // --- description from .arch file ---
    // SCC = (S0.u64[S1.u[5:0]] == 1).
    void
    Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // S1[5:0] selects one of the 64 bits of S0; SCC is set iff that
        // bit is set.
        scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPC__S_SETVSKIP class methods ---

    Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_setvskip")
    {
    } // Inst_SOPC__S_SETVSKIP

    Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP()
    {
    } // ~Inst_SOPC__S_SETVSKIP

    // --- description from .arch file ---
    // VSKIP = S0.u[S1.u[4:0]].
    // Enables and disables VSKIP mode.
    // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instructions are
    // issued.
    // If any vector operations are outstanding, S_WAITCNT must be issued
    // before executing.
    // This instruction requires one waitstate after executing (e.g. S_NOP 0).
    // Example:
    // s_waitcnt 0
    // s_setvskip 1, 0 // Enable vskip mode.
    // s_nop 1
    void
    Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst)
    {
        // VSKIP mode is not modeled; executing this instruction is a
        // simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods ---

    Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_set_gpr_idx_on")
    {
    } // Inst_SOPC__S_SET_GPR_IDX_ON

    Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON()
    {
    } // ~Inst_SOPC__S_SET_GPR_IDX_ON

    // --- description from .arch file ---
    // MODE.gpr_idx_en = 1;
    // M0[7:0] = S0.u[7:0];
    // M0[15:12] = SIMM4 (direct contents of S1 field);
    // // Remaining bits of M0 are unmodified.
    // Enable GPR indexing mode. Vector operations after this will perform
    // relative GPR addressing based on the contents of M0. The structure
    // SQ_M0_GPR_IDX_WORD may be used to decode M0.
    // The raw contents of the S1 field are read and used to set the enable
    // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and
    // S1[3] = VDST_REL.
    void
    Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst)
    {
        // GPR indexing mode is not modeled; executing this instruction is
        // a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_CMP_EQ_U64 class methods ---

    Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_U64

    Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64()
    {
    } // ~Inst_SOPC__S_CMP_EQ_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 == S1.i64).
    void
    Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Signed operand types are fine here: equality is identical for
        // signed and unsigned 64-bit values.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() == src1.rawData()) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPC__S_CMP_LG_U64 class methods ---

    Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_U64

    Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64()
    {
    } // ~Inst_SOPC__S_CMP_LG_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 != S1.i64).
    void
    Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Signed operand types are fine here: inequality is identical for
        // signed and unsigned 64-bit values.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() != src1.rawData()) ? 1 : 0;
        scc.write();
    } // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,648 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "dev/amdgpu/hwreg_defines.hh"
#include "gpu-compute/shader.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_SOPK__S_MOVK_I32 class methods ---
Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_movk_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_MOVK_I32
Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32()
{
} // ~Inst_SOPK__S_MOVK_I32
// --- description from .arch file ---
// D.i = signext(SIMM16) (sign extension).
void
Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
sdst = simm16;
sdst.write();
} // execute
    // --- Inst_SOPK__S_CMOVK_I32 class methods ---

    Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmovk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMOVK_I32

    Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32()
    {
    } // ~Inst_SOPK__S_CMOVK_I32

    // --- description from .arch file ---
    // if (SCC) then D.i = signext(SIMM16);
    // else NOP.
    // Conditional move with sign extension.
    void
    Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        // SDST is only written back when SCC is set; otherwise the
        // destination is left untouched.
        if (scc.rawData()) {
            sdst = simm16;
            sdst.write();
        }
    } // execute
// --- Inst_SOPK__S_CMPK_EQ_I32 class methods ---
Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_cmpk_eq_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_CMPK_EQ_I32
Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32()
{
} // ~Inst_SOPK__S_CMPK_EQ_I32
// --- description from .arch file ---
// SCC = (S0.i == signext(SIMM16)).
void
Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src.read();
scc = (src.rawData() == simm16) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPK__S_CMPK_LG_I32 class methods ---

    Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_I32

    Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_I32

    // --- description from .arch file ---
    // SCC = (S0.i != signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() != simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GT_I32 class methods ---

    Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_I32

    Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_I32

    // --- description from .arch file ---
    // SCC = (S0.i > signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() > simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GE_I32 class methods ---

    Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_I32

    Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_I32

    // --- description from .arch file ---
    // SCC = (S0.i >= signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() >= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LT_I32 class methods ---

    Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_I32

    Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_I32

    // --- description from .arch file ---
    // SCC = (S0.i < signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_I32 class methods ---

    Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_I32

    Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_I32

    // --- description from .arch file ---
    // SCC = (S0.i <= signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source (S0).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;
        scc.write();
    } // execute
// --- Inst_SOPK__S_CMPK_EQ_U32 class methods ---
Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_cmpk_eq_u32")
{
setFlag(ALU);
} // Inst_SOPK__S_CMPK_EQ_U32
Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32()
{
} // ~Inst_SOPK__S_CMPK_EQ_U32
// --- description from .arch file ---
// SCC = (S0.u == SIMM16).
void
Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
ScalarOperandU32 scc(gpuDynInst, REG_SCC);
src.read();
scc = (src.rawData() == simm16) ? 1 : 0;
scc.write();
} // execute
    // --- Inst_SOPK__S_CMPK_LG_U32 class methods ---

    Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_U32

    Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_U32

    // --- description from .arch file ---
    // SCC = (S0.u != SIMM16).
    void
    Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() != simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GT_U32 class methods ---

    Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_U32

    Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_U32

    // --- description from .arch file ---
    // SCC = (S0.u > SIMM16).
    void
    Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() > simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GE_U32 class methods ---

    Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_U32

    Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_U32

    // --- description from .arch file ---
    // SCC = (S0.u >= SIMM16).
    void
    Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() >= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LT_U32 class methods ---

    Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_U32

    Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_U32

    // --- description from .arch file ---
    // SCC = (S0.u < SIMM16).
    void
    Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_U32 class methods ---

    Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_U32

    Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_U32

    // --- description from .arch file ---
    // SCC = (S0.u <= SIMM16).
    void
    Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Immediate is compared as unsigned (contrast with the
        // sign-extending i32 forms above).
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;
        scc.write();
    } // execute
    // --- Inst_SOPK__S_ADDK_I32 class methods ---

    Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_addk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_ADDK_I32

    Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32()
    {
    } // ~Inst_SOPK__S_ADDK_I32

    // --- description from .arch file ---
    // D.i = D.i + signext(SIMM16);
    // SCC = overflow.
    void
    Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        // SDST is both source and destination: D = D + signext(SIMM16).
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16);
        // Signed-overflow detection: operands had the same sign but the
        // result's sign differs. Depends on sdst having been assigned above.
        scc = (bits(src.rawData(), 31) == bits(simm16, 15)
            && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
// --- Inst_SOPK__S_MULK_I32 class methods ---
Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_mulk_i32")
{
setFlag(ALU);
} // Inst_SOPK__S_MULK_I32
Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32()
{
} // ~Inst_SOPK__S_MULK_I32
// --- description from .arch file ---
// D.i = D.i * signext(SIMM16).
void
Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
src.read();
sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16);
sdst.write();
} // execute
    // --- Inst_SOPK__S_CBRANCH_I_FORK class methods ---

    Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cbranch_i_fork")
    {
        setFlag(Branch);
    } // Inst_SOPK__S_CBRANCH_I_FORK

    Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK()
    {
    } // ~Inst_SOPK__S_CBRANCH_I_FORK

    // --- description from .arch file ---
    // mask_pass = S0.u64 & EXEC;
    // mask_fail = ~S0.u64 & EXEC;
    // target_addr = PC + signext(SIMM16 * 4) + 4;
    // if (mask_pass == EXEC)
    //   PC = target_addr;
    // elsif (mask_fail == EXEC)
    //   PC += 4;
    // elsif (bitcount(mask_fail) < bitcount(mask_pass))
    //   EXEC = mask_fail;
    //   SGPR[CSP*4] = { target_addr, mask_pass };
    //   CSP++;
    //   PC += 4;
    // else
    //   EXEC = mask_pass;
    //   SGPR[CSP*4] = { PC + 4, mask_fail };
    //   CSP++;
    //   PC = target_addr;
    // end.
    // Conditional branch using branch-stack.
    // S0 = compare mask(vcc or any sgpr), and
    // SIMM16 = signed DWORD branch offset relative to next instruction.
    // See also S_CBRANCH_JOIN.
    void
    Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst)
    {
        // Branch-stack fork/join is not modeled; executing this
        // instruction is a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPK__S_GETREG_B32 class methods ---

    Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_getreg_b32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_GETREG_B32

    Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32()
    {
    } // ~Inst_SOPK__S_GETREG_B32

    // --- description from .arch file ---
    // D.u = hardware-reg. Read some or all of a hardware register into the
    // LSBs of D.
    // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
    // is 1..32.
    void
    Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Decode SIMM16; the size field is encoded as size - 1.
        ScalarRegI16 simm16 = instData.SIMM16;
        ScalarRegU32 hwregId = simm16 & 0x3f;
        ScalarRegU32 offset = (simm16 >> 6) & 31;
        ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

        ScalarRegU32 hwreg =
            gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        // NOTE(review): this read is dead -- sdst is fully overwritten
        // below before being written back.
        sdst.read();

        // Store value from hardware to part of the SDST.
        // NOTE(review): 1U << size overshifts (UB) when size == 32 --
        // confirm encodings never request a full-width field here.
        ScalarRegU32 mask = (((1U << size) - 1U) << offset);
        sdst = (hwreg & mask) >> offset;
        sdst.write();
    } // execute
// --- Inst_SOPK__S_SETREG_B32 class methods ---
Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_setreg_b32")
{
setFlag(ALU);
} // Inst_SOPK__S_SETREG_B32
Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
{
} // ~Inst_SOPK__S_SETREG_B32
// --- description from .arch file ---
// hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware
// register.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ScalarRegU32 hwregId = simm16 & 0x3f;
ScalarRegU32 offset = (simm16 >> 6) & 31;
ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
ScalarRegU32 hwreg =
gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
sdst.read();
// Store value from SDST to part of the hardware register.
ScalarRegU32 mask = (((1U << size) - 1U) << offset);
hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask));
gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg);
// set MODE register to control the behavior of single precision
// floating-point numbers: denormal mode or round mode
if (hwregId==1 && size==2
&& (offset==4 || offset==0)) {
warn_once("Be cautious that s_setreg_b32 has no real effect "
"on FP modes: %s\n", gpuDynInst->disassemble());
return;
}
// panic if not changing MODE of floating-point numbers
panicUnimplemented();
} // execute
// --- Inst_SOPK__S_SETREG_IMM32_B32 class methods ---
Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32(
InFmt_SOPK *iFmt)
: Inst_SOPK(iFmt, "s_setreg_imm32_b32")
{
setFlag(ALU);
} // Inst_SOPK__S_SETREG_IMM32_B32
Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32()
{
} // ~Inst_SOPK__S_SETREG_IMM32_B32
// --- description from .arch file ---
// Write some or all of the LSBs of IMM32 into a hardware register; this
// --- instruction requires a 32-bit literal constant.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI16 simm16 = instData.SIMM16;
ScalarRegU32 hwregId = simm16 & 0x3f;
ScalarRegU32 offset = (simm16 >> 6) & 31;
ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;
ScalarRegU32 hwreg =
gpuDynInst->computeUnit()->shader->getHwReg(hwregId);
ScalarRegI32 simm32 = extData.imm_u32;
// Store value from SIMM32 to part of the hardware register.
ScalarRegU32 mask = (((1U << size) - 1U) << offset);
hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask));
gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg);
// set MODE register to control the behavior of single precision
// floating-point numbers: denormal mode or round mode
if (hwregId==HW_REG_MODE && size==2
&& (offset==4 || offset==0)) {
warn_once("Be cautious that s_setreg_imm32_b32 has no real effect "
"on FP modes: %s\n", gpuDynInst->disassemble());
return;
}
// panic if not changing modes of single-precision FPs
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,900 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/GPUSync.hh"
#include "gpu-compute/shader.hh"
namespace gem5
{
namespace VegaISA
{
    // --- Inst_SOPP__S_NOP class methods ---

    Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_nop")
    {
        setFlag(Nop);
    } // Inst_SOPP__S_NOP

    Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP()
    {
    } // ~Inst_SOPP__S_NOP

    // --- description from .arch file ---
    // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time,
    // 7 = 8 times.
    // This instruction may be used to introduce wait states to resolve
    // hazards; see the shader programming guide for details. Compare with
    // S_SLEEP.
    void
    Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
        // The repeat count in SIMM16[2:0] only inserts wait states in
        // hardware; the timing model does not need them, so this is empty.
    } // execute
    // --- Inst_SOPP__S_ENDPGM class methods ---

    Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm")
    {
        setFlag(EndOfKernel);
    } // Inst_SOPP__S_ENDPGM

    Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM()
    {
    } // ~Inst_SOPP__S_ENDPGM

    // --- description from .arch file ---
    // End of program; terminate wavefront.
    // The hardware implicitly executes S_WAITCNT 0 before executing this
    // --- instruction.
    // See S_ENDPGM_SAVED for the context-switch version of this instruction.
    //
    // Terminates the wavefront: flushes its fetch state, releases its
    // barrier and registers, and -- when it is the last wavefront of the
    // last workgroup of the kernel -- injects a memory fence (GL2 WB)
    // before the workgroup is retired.
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
            wf->instructionBuffer.end());
        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        // NOTE(review): DPRINTF(GPUExec, ...) relies on debug/GPUExec.hh;
        // only debug/GPUSync.hh is included directly at the top of this
        // file -- confirm it is pulled in transitively.
        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
            wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
            "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
            wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // Sample per-register read-after-write statistics gathered during
        // the wave's lifetime, then reset the bookkeeping.
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

            /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one, i.e., indicating
             * the kernel end, then release operation (i.e., GL2 WB) is
             * needed
             */

            //check whether the workgroup is indicating the kernel end, i.e.,
            //the last workgroup in the kernel
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);
            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            //if it is not a kernel end, then retire the workgroup directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                // NOTE(review): S_STOPPED was already set above; this
                // second setStatus appears redundant.
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and
             * retire the workgroup after receiving response.
             * note that GL0V and GL1 are read only, and they just forward GL2
             * WB request. When forwarding, GL1 send the request to all GL2 in
             * the complex
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            // Kernel End = isKernel + isMemSync
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                    "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    } // execute
// --- Inst_SOPP__S_BRANCH class methods ---
Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_branch")
{
setFlag(Branch);
} // Inst_SOPP__S_BRANCH
Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
{
} // ~Inst_SOPP__S_BRANCH
// --- description from .arch file ---
// PC = PC + signext(SIMM16 * 4) + 4 (short jump).
// For a long jump, use S_SETPC.
void
Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
} // execute
    // --- Inst_SOPP__S_WAKEUP class methods ---

    Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_wakeup")
    {
    } // Inst_SOPP__S_WAKEUP

    Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP()
    {
    } // ~Inst_SOPP__S_WAKEUP

    // --- description from .arch file ---
    // Allow a wave to 'ping' all the other waves in its threadgroup to force
    // them to wake up immediately from an S_SLEEP instruction. The ping is
    // ignored if the waves are not sleeping.
    // This allows for more efficient polling on a memory location. The waves
    // which are polling can sit in a long S_SLEEP between memory reads, but
    // the wave which writes the value can tell them all to wake up early now
    // that the data is available. This is useful for fBarrier implementations
    // (speedup).
    // This method is also safe from races because if any wave misses the ping,
    // everything still works fine (whoever missed it just completes their
    // normal S_SLEEP).
    void
    Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Wave sleep/wakeup is not modeled; executing this instruction is
        // a simulation error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC0 class methods ---

    Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc0")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC0

    Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC0

    // --- description from .arch file ---
    // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        if (!scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        // pc is written back unconditionally; when the branch is not
        // taken the value is unchanged.
        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC1 class methods ---

    Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc1")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC1

    Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC1

    // --- description from .arch file ---
    // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        if (scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        // pc is written back unconditionally; when the branch is not
        // taken the value is unchanged.
        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCZ

    Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCZ

    // --- description from .arch file ---
    // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // Reads the full 64-bit VCC pair starting at VCC_LO; the branch is
        // taken only when every lane's bit is clear.
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        vcc.read();

        if (!vcc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccnz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCNZ

    Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCNZ

    // --- description from .arch file ---
    // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // Reads the full 64-bit VCC pair starting at VCC_LO; the branch is
        // taken when any lane's bit is set.
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        vcc.read();

        if (vcc.rawData()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
// --- Inst_SOPP__S_CBRANCH_EXECZ class methods ---
Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_execz")
{
setFlag(Branch);
setFlag(ReadsEXEC);
} // Inst_SOPP__S_CBRANCH_EXECZ
Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ()
{
} // ~Inst_SOPP__S_CBRANCH_EXECZ
// --- description from .arch file ---
// if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().none()) {
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
}
} // execute
// --- Inst_SOPP__S_CBRANCH_EXECNZ class methods ---
Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_execnz")
{
setFlag(Branch);
setFlag(ReadsEXEC);
} // Inst_SOPP__S_CBRANCH_EXECNZ
Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ()
{
} // ~Inst_SOPP__S_CBRANCH_EXECNZ
// --- description from .arch file ---
// if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
if (wf->execMask().any()) {
Addr pc = gpuDynInst->pc();
ScalarRegI16 simm16 = instData.SIMM16;
pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
wf->pc(pc);
}
} // execute
    // --- Inst_SOPP__S_BARRIER class methods ---
    Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_barrier")
    {
        setFlag(MemBarrier);
    } // Inst_SOPP__S_BARRIER
    Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER()
    {
    } // ~Inst_SOPP__S_BARRIER
    // --- description from .arch file ---
    // Synchronize waves within a threadgroup.
    // If not all waves of the threadgroup have been created yet, waits for
    // entire group before proceeding.
    // If some waves in the threadgroup have already terminated, this waits on
    // only the surviving waves.
    // Barriers are legal inside trap handlers.
    void
    Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();
        // A wave with no barrier allocated treats s_barrier as a no-op
        // (presumably the single-wave-workgroup case -- verify against the
        // dispatcher's barrier allocation logic).
        if (wf->hasBarrier()) {
            int bar_id = wf->barrierId();
            // NOTE(review): the wave is expected to already be in S_BARRIER
            // state here, presumably set by an earlier pipeline stage --
            // confirm where that transition happens.
            assert(wf->getStatus() == Wavefront::S_BARRIER);
            // Register this wave's arrival with the CU's barrier bookkeeping
            // before logging, so the counts printed below include it.
            cu->incNumAtBarrier(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
                    "barrier Id%d. %d waves now at barrier, %d waves "
                    "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
                    wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
                    cu->numYetToReachBarrier(bar_id));
        }
    } // execute
// --- Inst_SOPP__S_SETKILL class methods ---
Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_setkill")
{
} // Inst_SOPP__S_SETKILL
Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL()
{
} // ~Inst_SOPP__S_SETKILL
// --- description from .arch file ---
// set KILL bit to value of SIMM16[0].
// Used primarily for debugging kill wave host command behavior.
void
Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_WAITCNT class methods ---
Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_waitcnt")
{
setFlag(ALU);
setFlag(Waitcnt);
} // Inst_SOPP__S_WAITCNT
Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT()
{
} // ~Inst_SOPP__S_WAITCNT
// --- description from .arch file ---
// Wait for the counts of outstanding lds, vector-memory and
// --- export/vmem-write-data to be at or below the specified levels.
// SIMM16[3:0] = vmcount (vector memory operations),
// SIMM16[6:4] = export/mem-write-data count,
// SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count).
void
Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 vm_cnt = 0;
ScalarRegI32 exp_cnt = 0;
ScalarRegI32 lgkm_cnt = 0;
vm_cnt = bits<ScalarRegI16>(instData.SIMM16, 3, 0);
exp_cnt = bits<ScalarRegI16>(instData.SIMM16, 6, 4);
lgkm_cnt = bits<ScalarRegI16>(instData.SIMM16, 12, 8);
gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT);
gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt);
} // execute
// --- Inst_SOPP__S_SETHALT class methods ---
Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sethalt")
{
} // Inst_SOPP__S_SETHALT
Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT()
{
} // ~Inst_SOPP__S_SETHALT
// --- description from .arch file ---
// Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume.
// The halt flag is ignored while PRIV == 1 (inside trap handlers) but the
// shader will halt immediately after the handler returns if HALT is still
// set at that time.
void
Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SLEEP class methods ---
Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sleep")
{
setFlag(ALU);
setFlag(Sleep);
} // Inst_SOPP__S_SLEEP
Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP()
{
} // ~Inst_SOPP__S_SLEEP
// --- description from .arch file ---
// Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks.
// The exact amount of delay is approximate. Compare with S_NOP.
void
Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16;
gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP);
// sleep duration is specified in multiples of 64 cycles
gpuDynInst->wavefront()->setSleepTime(64 * simm16);
} // execute
// --- Inst_SOPP__S_SETPRIO class methods ---
Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_setprio")
{
setFlag(ALU);
} // Inst_SOPP__S_SETPRIO
Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO()
{
} // ~Inst_SOPP__S_SETPRIO
// --- description from .arch file ---
// User settable wave priority is set to SIMM16[1:0]. 0 = lowest,
// 3 = highest.
// The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0],
// WaveAge[3:0]}.
void
Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst)
{
ScalarRegU16 simm16 = instData.SIMM16;
ScalarRegU32 userPrio = simm16 & 0x3;
warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio);
} // execute
// --- Inst_SOPP__S_SENDMSG class methods ---
Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sendmsg")
{
} // Inst_SOPP__S_SENDMSG
Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG()
{
} // ~Inst_SOPP__S_SENDMSG
// --- description from .arch file ---
// Send a message upstream to VGT or the interrupt handler.
// SIMM16[9:0] contains the message type and is documented in the shader
// --- programming guide.
void
Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SENDMSGHALT class methods ---
Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_sendmsghalt")
{
} // Inst_SOPP__S_SENDMSGHALT
Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT()
{
} // ~Inst_SOPP__S_SENDMSGHALT
// --- description from .arch file ---
// Send a message and then HALT the wavefront; see S_SENDMSG for details.
void
Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_TRAP class methods ---
Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_trap")
{
} // Inst_SOPP__S_TRAP
Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP()
{
} // ~Inst_SOPP__S_TRAP
// --- description from .arch file ---
// TrapID = SIMM16[7:0];
// Wait for all instructions to complete;
// set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0],
// PC[47:0]};
// PC = TBA (trap base address);
// PRIV = 1.
// Enter the trap handler. This instruction may be generated internally as
// well in response to a host trap (HT = 1) or an exception.
// TrapID 0 is reserved for hardware use and should not be used in a
// shader-generated trap.
void
Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_ICACHE_INV class methods ---
Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_icache_inv")
{
} // Inst_SOPP__S_ICACHE_INV
Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
{
} // ~Inst_SOPP__S_ICACHE_INV
// --- description from .arch file ---
// Invalidate entire L1 instruction cache.
// You must have 12 separate S_NOP instructions or a jump/branch
// instruction after this instruction
// to ensure the SQ instruction buffer is purged.
void
Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_INCPERFLEVEL class methods ---
Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_incperflevel")
{
} // Inst_SOPP__S_INCPERFLEVEL
Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL()
{
} // ~Inst_SOPP__S_INCPERFLEVEL
// --- description from .arch file ---
// Increment performance counter specified in SIMM16[3:0] by 1.
void
Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_DECPERFLEVEL class methods ---
Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_decperflevel")
{
} // Inst_SOPP__S_DECPERFLEVEL
Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL()
{
} // ~Inst_SOPP__S_DECPERFLEVEL
// --- description from .arch file ---
// Decrement performance counter specified in SIMM16[3:0] by 1.
void
Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_TTRACEDATA class methods ---
Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_ttracedata")
{
} // Inst_SOPP__S_TTRACEDATA
Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA()
{
} // ~Inst_SOPP__S_TTRACEDATA
// --- description from .arch file ---
// Send M0 as user data to the thread trace stream.
void
Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS
Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS
// --- description from .arch file ---
// if (conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4)
// + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods ---
Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbguser")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGUSER
Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGUSER
// --- description from .arch file ---
// if (conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::
~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
// --- description from .arch file ---
// if (conditional_debug_system || conditional_debug_user) then PC = PC +
// --- signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods ---
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user")
{
setFlag(Branch);
} // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER()
{
} // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
// --- description from .arch file ---
// if (conditional_debug_system && conditional_debug_user) then PC = PC +
// --- signext(SIMM16 * 4) + 4;
// else NOP.
void
Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_ENDPGM_SAVED class methods ---
Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_endpgm_saved")
{
} // Inst_SOPP__S_ENDPGM_SAVED
Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED()
{
} // ~Inst_SOPP__S_ENDPGM_SAVED
// --- description from .arch file ---
// End of program; signal that a wave has been saved by the context-switch
// trap handler and terminate wavefront.
// The hardware implicitly executes S_WAITCNT 0 before executing this
// instruction.
// Use S_ENDPGM in all cases unless you are executing the context-switch
// save handler.
void
Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods ---
Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_set_gpr_idx_off")
{
} // Inst_SOPP__S_SET_GPR_IDX_OFF
Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF()
{
} // ~Inst_SOPP__S_SET_GPR_IDX_OFF
// --- description from .arch file ---
// MODE.gpr_idx_en = 0.
// Clear GPR indexing mode. Vector operations after this will not perform
// --- relative GPR addressing regardless of the contents of M0. This
// --- instruction does not modify M0.
void
Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods ---
Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE(
InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_set_gpr_idx_mode")
{
} // Inst_SOPP__S_SET_GPR_IDX_MODE
Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE()
{
} // ~Inst_SOPP__S_SET_GPR_IDX_MODE
// --- description from .arch file ---
// M0[15:12] = SIMM4.
// Modify the mode used for vector GPR indexing.
// The raw contents of the source field are read and used to set the enable
// bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL
// and SIMM4[3] = VDST_REL.
void
Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace VegaISA
{
// --- Inst_VINTRP__V_INTERP_P1_F32 class methods ---
Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_p1_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_P1_F32
Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32()
{
} // ~Inst_VINTRP__V_INTERP_P1_F32
// --- description from .arch file ---
// D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
// V_MAD_F32 for SP).
// CAUTION: when in HALF_LDS mode, D must not be the same GPR as S;
// if D == S then data corruption will occur.
// NOTE: In textual representations the I/J VGPR is the first source and
// the attribute is the second source; however in the VOP3 encoding the
// attribute is stored in the src0 field and the VGPR is stored in the
// src1 field.
void
Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_VINTRP__V_INTERP_P2_F32 class methods ---
Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_p2_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_P2_F32
Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32()
{
} // ~Inst_VINTRP__V_INTERP_P2_F32
// --- description from .arch file ---
// D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
// V_MAD_F32 for SP).
// NOTE: In textual representations the I/J VGPR is the first source and
// the attribute is the second source; however in the VOP3 encoding the
// attribute is stored in the src0 field and the VGPR is stored in the
// src1 field.
void
Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
// --- Inst_VINTRP__V_INTERP_MOV_F32 class methods ---
Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32(
InFmt_VINTRP *iFmt)
: Inst_VINTRP(iFmt, "v_interp_mov_f32")
{
setFlag(ALU);
setFlag(F32);
} // Inst_VINTRP__V_INTERP_MOV_F32
Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32()
{
} // ~Inst_VINTRP__V_INTERP_MOV_F32
// --- description from .arch file ---
// D.f = {P10,P20,P0}[S.u]; parameter load.
void
Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
{
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -31,6 +31,7 @@
#include "arch/amdgpu/vega/insts/vop3p.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/arm/insts/fplib.hh"
namespace gem5
@@ -631,5 +632,236 @@ void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
vdst.write();
}
// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_fma_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_FMA_F32
Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32()
{
} // ~Inst_VOP3P__V_PK_FMA_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
void
Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
src2.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
: bits(src2[lane], 31, 0);
float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
*reinterpret_cast<float*>(&s1l),
*reinterpret_cast<float*>(&s2l));
uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
: bits(src2[lane], 31, 0);
float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
*reinterpret_cast<float*>(&s1h),
*reinterpret_cast<float*>(&s2h));
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_mul_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_MUL_F32
Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
{
} // ~Inst_VOP3P__V_PK_MUL_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
// S1.f[31:0]
void
Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword1 = *reinterpret_cast<float*>(&lower_dword)
* *reinterpret_cast<float*>(&upper_dword);
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword2 = *reinterpret_cast<float*>(&lower_dword)
* *reinterpret_cast<float*>(&upper_dword);
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_add_f32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_ADD_F32
Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
{
} // ~Inst_VOP3P__V_PK_ADD_F32
// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
// S1.f[31:0]
void
Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
// values cannot use bitwise operations. Consider the U64 to imply
// untyped 64-bits of data.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
int opsel = instData.OPSEL;
int opsel_hi = extData.OPSEL_HI;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword1 = *reinterpret_cast<float*>(&lower_dword)
+ *reinterpret_cast<float*>(&upper_dword);
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
float dword2 = *reinterpret_cast<float*>(&lower_dword)
+ *reinterpret_cast<float*>(&upper_dword);
uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
}
}
vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
: Inst_VOP3P(iFmt, "v_pk_mov_b32")
{
setFlag(ALU);
} // Inst_VOP3P__V_PK_MOV_B32
Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32()
{
} // ~Inst_VOP3P__V_PK_MOV_B32
// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
void
Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
// This is a special case of packed instructions which operates on
// 64-bit inputs/outputs and not 32-bit.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
// Only OPSEL[1:0] are used
// OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
int opsel = instData.OPSEL;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
// OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1
uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
: bits(src0[lane], 31, 0);
uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
: bits(src1[lane], 31, 0);
vdst[lane] = upper_dword << 32 | lower_dword;
}
}
vdst.write();
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -0,0 +1,257 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/amdgpu/vega/insts/vop3p.hh"
namespace gem5
{
namespace VegaISA
{
    // --- Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8 class methods ---
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::
        Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8(InFmt_VOP3P_MAI *iFmt)
        : Inst_VOP3P_MAI(iFmt, "v_mfma_i32_16x16x16i8")
    {
        setFlag(ALU);
    } // Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::
        ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8()
    {
    } // ~Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8
    // D(16x16I32) = A(16x16I8) x B(16x16I8) + C(16x16I32), 1 Blocks, 8
    // pass, srcA/srcB 1 archVgpr, srcC/D 4 accVGPR
    // Functional (not cycle-accurate) matrix-fused-multiply-add: unpacks
    // A and B from one VGPR each, accumulates C from four VGPRs, computes
    // result = A*B + C with a plain triple loop, and scatters the result
    // back into four destination VGPRs.
    void
    Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst)
    {
        // ACC_CD would redirect C/D to the accumulation-register file;
        // currently ignored with a warning, so acc_offset stays 0.
        int acc_offset = 0;
        if (instData.ACC_CD) {
            warn("ACC_CD not yet implemented\n");
        }

        // int8 size allows for 4 elements per lane. At 16x16 this means 4
        // lanes per column (A matrix) / (B matrix). This whole matrix fits
        // in one VGPR. The C matrix with size int32 requires 4 VGPRs.

        // Handle the C matrix by using a delta. This is set to 1 normally to
        // move to the next VGPR (1 dword away) and 0 if the input is a scalar
        // reg (e.g., a constant).
        int delta = isVectorReg(extData.SRC2) ? 1 : 0;

        // VecOperandI8 will read 8 bits and sign extend, so used U32 to read
        // as "untyped" 32-bit values.
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset);
        ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
        ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
        ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
        VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset);
        VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1);
        VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2);
        VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3);

        src0.readSrc();
        src1.readSrc();
        src2a.readSrc();
        src2b.readSrc();
        src2c.readSrc();
        src2d.readSrc();

        // Unpack A: each lane carries four sign-extended int8 elements of
        // one row; lane groups of 16 advance the starting column by 4.
        int32_t A[16][16];
        for (int i = 0; i < 64; ++i) {
            // src0[0:15] contains columns 1 - 4 packed for rows 0 - 15,
            // src0[16:31] contains columns 5 - 8 packed for rows 0 - 15,
            // src0[32:47] contains columns 9 - 12 packed for rows 0 - 15,
            // src0[48:63] contains columns 13 - 16 packed for rows 0 - 15,
            int row = i % 16;
            int start_col = (i / 16) * 4;
            A[row][start_col+0] = sext<8>(bits(src0[i], 7, 0));
            A[row][start_col+1] = sext<8>(bits(src0[i], 15, 8));
            A[row][start_col+2] = sext<8>(bits(src0[i], 23, 16));
            A[row][start_col+3] = sext<8>(bits(src0[i], 31, 24));
        }

        // Unpack B: transposed layout relative to A -- each lane carries
        // four packed rows of one column.
        int32_t B[16][16];
        for (int i = 0; i < 64; ++i) {
            // src1[0:15] contains rows 1 - 4 packed for columns 0 - 15
            // src1[16:31] contains rows 5 - 8 packed for columns 0 - 15
            // src1[32:47] contains rows 9 - 12 packed for columns 0 - 15
            // src1[48:63] contains rows 13 - 16 packed for columns 0 - 15
            int start_row = (i / 16) * 4;
            int col = i % 16;
            B[start_row+0][col] = sext<8>(bits(src1[i], 7, 0));
            B[start_row+1][col] = sext<8>(bits(src1[i], 15, 8));
            B[start_row+2][col] = sext<8>(bits(src1[i], 23, 16));
            B[start_row+3][col] = sext<8>(bits(src1[i], 31, 24));
        }

        int32_t result[16][16];

        // Load accumulation matrix C into result
        for (int i = 0; i < 64; ++i) {
            // src2a contains rows 0, 4, 8, 12
            result[(i/16)*4][(i%16)] = src2a[i];
            // src2b contains rows 1, 5, 9, 13
            result[(i/16)*4+1][(i%16)] = src2b[i];
            // src2c contains rows 2, 6, 10, 14
            result[(i/16)*4+2][(i%16)] = src2c[i];
            // src2d contains rows 3, 7, 11, 15
            result[(i/16)*4+3][(i%16)] = src2d[i];
        }

        // Compute new result - This is (obviously) not optimized
        for (int i = 0; i < 16; ++i) {
            for (int j = 0; j < 16; ++j) {
                for (int k = 0; k < 16; ++k) {
                    result[i][j] += A[i][k] * B[k][j];
                }
            }
        }

        // Put result in dest VGPRs, mirroring the C-matrix row interleave.
        for (int i = 0; i < 64; ++i) {
            // vdsta contains rows 0, 4, 8, 12
            vdsta[i] = result[(i/16)*4][(i%16)];
            // vdstb contains rows 1, 5, 9, 13
            vdstb[i] = result[(i/16)*4+1][(i%16)];
            // vdstc contains rows 2, 6, 10, 14
            vdstc[i] = result[(i/16)*4+2][(i%16)];
            // vdstd contains rows 3, 7, 11, 15
            vdstd[i] = result[(i/16)*4+3][(i%16)];
        }

        vdsta.write();
        vdstb.write();
        vdstc.write();
        vdstd.write();
    } // execute
// --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---

// Decode-time constructor: records the disassembly mnemonic and tags
// the instruction as an ALU operation.
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(
        InFmt_VOP3P_MAI *iFmt)
    : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64")
{
    setFlag(ALU);
} // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
// Trivial destructor; nothing to release.
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
    ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64() = default;
// D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8
// pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR
void
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
{
    int acc_offset = 0;
    if (instData.ACC_CD) {
        warn("ACC_CD not yet implemented\n");
    }

    // src2 covers 8 dwords (four F64 register pairs), which is more
    // than the operand's operator[] overload can address at once, so
    // each pair is mapped through its own operand object. When SRC2
    // names a VGPR, consecutive pairs sit two dwords apart; when it is
    // a constant, every pair reads the same value, hence stride zero.
    const int pairStride = isVectorReg(extData.SRC2) ? 2 : 0;

    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2 + acc_offset);
    ConstVecOperandF64 src2b(gpuDynInst,
                             extData.SRC2 + acc_offset + 1 * pairStride);
    ConstVecOperandF64 src2c(gpuDynInst,
                             extData.SRC2 + acc_offset + 2 * pairStride);
    ConstVecOperandF64 src2d(gpuDynInst,
                             extData.SRC2 + acc_offset + 3 * pairStride);
    VecOperandF64 vdsta(gpuDynInst, instData.VDST + acc_offset);
    VecOperandF64 vdstb(gpuDynInst, instData.VDST + acc_offset + 2);
    VecOperandF64 vdstc(gpuDynInst, instData.VDST + acc_offset + 4);
    VecOperandF64 vdstd(gpuDynInst, instData.VDST + acc_offset + 6);

    src0.readSrc();
    src1.readSrc();
    src2a.readSrc();
    src2b.readSrc();
    src2c.readSrc();
    src2d.readSrc();

    // Seed the accumulator with C, which is laid out row major:
    // src2a holds rows 0-3, src2b rows 4-7, src2c rows 8-11, and
    // src2d rows 12-15; within each operand, lane = 16*row + col.
    double acc[16][16];
    for (int lane = 0; lane < 64; ++lane) {
        const int row = lane / 16;
        const int col = lane % 16;
        acc[row][col] = src2a[lane];
        acc[row + 4][col] = src2b[lane];
        acc[row + 8][col] = src2c[lane];
        acc[row + 12][col] = src2d[lane];
    }

    // Accumulate A x B. A (16x4) is column major: column k occupies
    // lanes [16k, 16k+15] of src0. B (4x16) is row major: row k
    // occupies lanes [16k, 16k+15] of src1. Iterating k outermost
    // still adds the k = 0..3 terms to each element in the same order
    // as a per-element dot product would.
    for (int k = 0; k < 4; ++k) {
        for (int row = 0; row < 16; ++row) {
            const double a_elem = src0[16 * k + row];
            for (int col = 0; col < 16; ++col) {
                acc[row][col] += a_elem * src1[16 * k + col];
            }
        }
    }

    // Scatter the 16x16 result across the destination register pairs,
    // mirroring the src2 layout: vdsta holds rows 0-3, vdstb rows 4-7,
    // vdstc rows 8-11, and vdstd rows 12-15.
    for (int lane = 0; lane < 64; ++lane) {
        const int row = lane / 16;
        const int col = lane % 16;
        vdsta[lane] = acc[row][col];
        vdstb[lane] = acc[row + 4][col];
        vdstc[lane] = acc[row + 8][col];
        vdstd[lane] = acc[row + 12][col];
    }

    vdsta.write();
    vdstb.write();
    vdstc.write();
    vdstd.write();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff