This patch adds an ordered response buffer to the GM pipeline to ensure in-order data delivery. The buffer is implemented as an STL ordered map (std::map), which sorts the requests into program order using their sequence IDs. When requests return to the GM pipeline they are marked as done. Only the oldest request may be serviced from the ordered buffer, and only if it is marked as done. The FIFO response buffers are kept and used in out-of-order (OoO) delivery mode.
792 lines
23 KiB
C++
792 lines
23 KiB
C++
/*
|
|
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* Author: Marc Orr
|
|
*/
|
|
|
|
#include <csignal>
|
|
|
|
#include "arch/hsail/insts/decl.hh"
|
|
#include "arch/hsail/insts/mem.hh"
|
|
|
|
namespace HsailISA
|
|
{
|
|
// Pseudo (or magic) instructions are overloaded on the hsail call
|
|
// instruction, because of its flexible parameter signature.
|
|
|
|
// To add a new magic instruction:
|
|
// 1. Add an entry to the enum.
|
|
// 2. Implement it in the switch statement below (Call::exec).
|
|
// 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
|
|
// so its easy to call from an OpenCL kernel.
|
|
|
|
// This enum should be identical to the enum in
|
|
// hsa/hsail-gpu-compute/util/magicinst.h
|
|
    // Opcode values for the magic (pseudo) instructions. The first source
    // operand of the overloaded HSAIL call instruction carries one of these
    // values; Call::execPseudoInst dispatches on it. Must stay identical to
    // the enum in hsa/hsail-gpu-compute/util/magicinst.h.
    enum
    {
        // debug-print helpers (wavefront-wide and per-lane)
        MAGIC_PRINT_WF_32 = 0,
        MAGIC_PRINT_WF_64,
        MAGIC_PRINT_LANE,
        MAGIC_PRINT_LANE_64,
        MAGIC_PRINT_WF_FLOAT,
        // raise SIGTRAP for an attached debugger
        MAGIC_SIM_BREAK,
        // cross-lane arithmetic
        MAGIC_PREF_SUM,
        MAGIC_REDUCTION,
        // lane-mask construction (lower/upper half of the wavefront)
        MAGIC_MASKLANE_LOWER,
        MAGIC_MASKLANE_UPPER,
        // wavefront barrier join/wait
        MAGIC_JOIN_WF_BAR,
        MAGIC_WAIT_WF_BAR,
        // abort simulation with a message
        MAGIC_PANIC,
        // memory operations issued through the global memory pipe
        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
        MAGIC_LOAD_GLOBAL_U32_REG,
        MAGIC_XACT_CAS_LD,
        // most-significant-lane selection/broadcast
        MAGIC_MOST_SIG_THD,
        MAGIC_MOST_SIG_BROADCAST,
        // wavefront prints gated on a wavefront ID operand
        MAGIC_PRINT_WFID_32,
        MAGIC_PRINT_WFID_64
    };
|
|
|
|
    // Decode the magic opcode from source operand 0 and dispatch to the
    // matching handler. All active lanes must agree on the opcode; a
    // mismatch aborts the simulation, as does an unknown opcode.
    void
    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        const VectorMask &mask = w->getPred();

        int op = 0;
        bool got_op = false;

        // Scan every active lane's operand 0 and verify that they all
        // carry the same opcode value.
        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                int src_val0 = src1.get<int>(w, lane, 0);
                if (got_op) {
                    if (src_val0 != op) {
                        fatal("Multiple magic instructions per PC not "
                              "supported\n");
                    }
                } else {
                    // first active lane establishes the opcode
                    op = src_val0;
                    got_op = true;
                }
            }
        }

        // Dispatch table: one handler per MAGIC_* opcode. Only the memory
        // operations need the dynamic instruction.
        switch(op) {
          case MAGIC_PRINT_WF_32:
            MagicPrintWF32(w);
            break;
          case MAGIC_PRINT_WF_64:
            MagicPrintWF64(w);
            break;
          case MAGIC_PRINT_LANE:
            MagicPrintLane(w);
            break;
          case MAGIC_PRINT_LANE_64:
            MagicPrintLane64(w);
            break;
          case MAGIC_PRINT_WF_FLOAT:
            MagicPrintWFFloat(w);
            break;
          case MAGIC_SIM_BREAK:
            MagicSimBreak(w);
            break;
          case MAGIC_PREF_SUM:
            MagicPrefixSum(w);
            break;
          case MAGIC_REDUCTION:
            MagicReduction(w);
            break;
          case MAGIC_MASKLANE_LOWER:
            MagicMaskLower(w);
            break;
          case MAGIC_MASKLANE_UPPER:
            MagicMaskUpper(w);
            break;
          case MAGIC_JOIN_WF_BAR:
            MagicJoinWFBar(w);
            break;
          case MAGIC_WAIT_WF_BAR:
            MagicWaitWFBar(w);
            break;
          case MAGIC_PANIC:
            MagicPanic(w);
            break;

          // atomic instructions
          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
            break;

          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
            break;

          case MAGIC_LOAD_GLOBAL_U32_REG:
            MagicLoadGlobalU32Reg(w, gpuDynInst);
            break;

          case MAGIC_XACT_CAS_LD:
            MagicXactCasLd(w);
            break;

          case MAGIC_MOST_SIG_THD:
            MagicMostSigThread(w);
            break;

          case MAGIC_MOST_SIG_BROADCAST:
            MagicMostSigBroadcast(w);
            break;

          case MAGIC_PRINT_WFID_32:
            MagicPrintWF32ID(w);
            break;

          case MAGIC_PRINT_WFID_64:
            MagicPrintWFID64(w);
            break;

          default: fatal("unrecognized magic instruction: %d\n", op);
        }
    }
|
|
|
|
void
|
|
Call::MagicPrintLane(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
if (src_val2) {
|
|
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
|
|
disassemble(), w->computeUnit->cu_id, w->simdId,
|
|
w->wfSlotId, lane, src_val1);
|
|
} else {
|
|
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
|
|
disassemble(), w->computeUnit->cu_id, w->simdId,
|
|
w->wfSlotId, lane, src_val1);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintLane64(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
if (src_val2) {
|
|
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
|
|
disassemble(), w->computeUnit->cu_id, w->simdId,
|
|
w->wfSlotId, lane, src_val1);
|
|
} else {
|
|
DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
|
|
disassemble(), w->computeUnit->cu_id, w->simdId,
|
|
w->wfSlotId, lane, src_val1);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintWF32(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
std::string res_str;
|
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (!(lane & 7)) {
|
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
|
}
|
|
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
|
|
if (src_val2) {
|
|
res_str += csprintf("%08x", src_val1);
|
|
} else {
|
|
res_str += csprintf("%08d", src_val1);
|
|
}
|
|
} else {
|
|
res_str += csprintf("xxxxxxxx");
|
|
}
|
|
|
|
if ((lane & 7) == 7) {
|
|
res_str += csprintf("\n");
|
|
} else {
|
|
res_str += csprintf(" ");
|
|
}
|
|
}
|
|
|
|
res_str += "\n\n";
|
|
DPRINTFN(res_str.c_str());
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintWF32ID(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
std::string res_str;
|
|
int src_val3 = -1;
|
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (!(lane & 7)) {
|
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
|
}
|
|
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
src_val3 = src1.get<int>(w, lane, 3);
|
|
|
|
if (src_val2) {
|
|
res_str += csprintf("%08x", src_val1);
|
|
} else {
|
|
res_str += csprintf("%08d", src_val1);
|
|
}
|
|
} else {
|
|
res_str += csprintf("xxxxxxxx");
|
|
}
|
|
|
|
if ((lane & 7) == 7) {
|
|
res_str += csprintf("\n");
|
|
} else {
|
|
res_str += csprintf(" ");
|
|
}
|
|
}
|
|
|
|
res_str += "\n\n";
|
|
if (w->wfDynId == src_val3) {
|
|
DPRINTFN(res_str.c_str());
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintWF64(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
std::string res_str;
|
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (!(lane & 3)) {
|
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
|
}
|
|
|
|
if (mask[lane]) {
|
|
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
|
|
if (src_val2) {
|
|
res_str += csprintf("%016x", src_val1);
|
|
} else {
|
|
res_str += csprintf("%016d", src_val1);
|
|
}
|
|
} else {
|
|
res_str += csprintf("xxxxxxxxxxxxxxxx");
|
|
}
|
|
|
|
if ((lane & 3) == 3) {
|
|
res_str += csprintf("\n");
|
|
} else {
|
|
res_str += csprintf(" ");
|
|
}
|
|
}
|
|
|
|
res_str += "\n\n";
|
|
DPRINTFN(res_str.c_str());
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintWFID64(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
std::string res_str;
|
|
int src_val3 = -1;
|
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (!(lane & 3)) {
|
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
|
}
|
|
|
|
if (mask[lane]) {
|
|
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
src_val3 = src1.get<int>(w, lane, 3);
|
|
|
|
if (src_val2) {
|
|
res_str += csprintf("%016x", src_val1);
|
|
} else {
|
|
res_str += csprintf("%016d", src_val1);
|
|
}
|
|
} else {
|
|
res_str += csprintf("xxxxxxxxxxxxxxxx");
|
|
}
|
|
|
|
if ((lane & 3) == 3) {
|
|
res_str += csprintf("\n");
|
|
} else {
|
|
res_str += csprintf(" ");
|
|
}
|
|
}
|
|
|
|
res_str += "\n\n";
|
|
if (w->wfDynId == src_val3) {
|
|
DPRINTFN(res_str.c_str());
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void
|
|
Call::MagicPrintWFFloat(Wavefront *w)
|
|
{
|
|
#if TRACING_ON
|
|
const VectorMask &mask = w->getPred();
|
|
std::string res_str;
|
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (!(lane & 7)) {
|
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
|
}
|
|
|
|
if (mask[lane]) {
|
|
float src_val1 = src1.get<float>(w, lane, 1);
|
|
res_str += csprintf("%08f", src_val1);
|
|
} else {
|
|
res_str += csprintf("xxxxxxxx");
|
|
}
|
|
|
|
if ((lane & 7) == 7) {
|
|
res_str += csprintf("\n");
|
|
} else {
|
|
res_str += csprintf(" ");
|
|
}
|
|
}
|
|
|
|
res_str += "\n\n";
|
|
DPRINTFN(res_str.c_str());
|
|
#endif
|
|
}
|
|
|
|
// raises a signal that GDB will catch
|
|
// when done with the break, type "signal 0" in gdb to continue
|
|
void
|
|
Call::MagicSimBreak(Wavefront *w)
|
|
{
|
|
std::string res_str;
|
|
// print out state for this wavefront and then break
|
|
res_str = csprintf("Breakpoint encountered for wavefront %i\n",
|
|
w->wfSlotId);
|
|
|
|
res_str += csprintf(" Kern ID: %i\n", w->kernId);
|
|
res_str += csprintf(" Phase ID: %i\n", w->simdId);
|
|
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
|
|
res_str += csprintf(" Exec mask: ");
|
|
|
|
for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
|
|
if (w->execMask(i))
|
|
res_str += "1";
|
|
else
|
|
res_str += "0";
|
|
|
|
if ((i & 7) == 7)
|
|
res_str += " ";
|
|
}
|
|
|
|
res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
|
|
|
|
res_str += "\nHelpful debugging hints:\n";
|
|
res_str += " Check out w->s_reg / w->d_reg for register state\n";
|
|
|
|
res_str += "\n\n";
|
|
DPRINTFN(res_str.c_str());
|
|
fflush(stdout);
|
|
|
|
raise(SIGTRAP);
|
|
}
|
|
|
|
void
|
|
Call::MagicPrefixSum(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int res = 0;
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
dest.set<int>(w, lane, res);
|
|
res += src_val1;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::MagicReduction(Wavefront *w)
|
|
{
|
|
// reduction magic instruction
|
|
// The reduction instruction takes up to 64 inputs (one from
|
|
// each thread in a WF) and sums them. It returns the sum to
|
|
// each thread in the WF.
|
|
const VectorMask &mask = w->getPred();
|
|
int res = 0;
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
res += src_val1;
|
|
}
|
|
}
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
dest.set<int>(w, lane, res);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::MagicMaskLower(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int res = 0;
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
|
|
if (src_val1) {
|
|
if (lane < (w->computeUnit->wfSize()/2)) {
|
|
res = res | ((uint32_t)(1) << lane);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
dest.set<int>(w, lane, res);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::MagicMaskUpper(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int res = 0;
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
|
|
if (src_val1) {
|
|
if (lane >= (w->computeUnit->wfSize()/2)) {
|
|
res = res | ((uint32_t)(1) <<
|
|
(lane - (w->computeUnit->wfSize()/2)));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
dest.set<int>(w, lane, res);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::MagicJoinWFBar(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int max_cnt = 0;
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
w->barCnt[lane]++;
|
|
|
|
if (w->barCnt[lane] > max_cnt) {
|
|
max_cnt = w->barCnt[lane];
|
|
}
|
|
}
|
|
}
|
|
|
|
if (max_cnt > w->maxBarCnt) {
|
|
w->maxBarCnt = max_cnt;
|
|
}
|
|
}
|
|
|
|
    // Wait side of the wavefront barrier: active lanes decrement their
    // per-lane counters, the wavefront-wide maximum is recomputed, and the
    // instruction buffer is flushed past the current instruction.
    void
    Call::MagicWaitWFBar(Wavefront *w)
    {
        const VectorMask &mask = w->getPred();
        int max_cnt = 0;

        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                w->barCnt[lane]--;
            }

            // NOTE(review): unlike MagicJoinWFBar, this max scan runs for
            // ALL lanes, not just the active ones — presumably intentional
            // so inactive lanes' outstanding joins keep the barrier open;
            // confirm against the barrier protocol.
            if (w->barCnt[lane] > max_cnt) {
                max_cnt = w->barCnt[lane];
            }
        }

        // maxBarCnt only ever shrinks here (the join side only grows it).
        if (max_cnt < w->maxBarCnt) {
            w->maxBarCnt = max_cnt;
        }

        // Discard everything already fetched after this instruction, and
        // drop any fetch still in flight, forcing a re-fetch after the
        // barrier resolves.
        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
                                   w->instructionBuffer.end());
        if (w->pendingFetch)
            w->dropFetch = true;
    }
|
|
|
|
void
|
|
Call::MagicPanic(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
|
|
src_val1, lane);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
|
|
{
|
|
// the address is in src1 | src2
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
int src_val1 = src1.get<int>(w, lane, 1);
|
|
int src_val2 = src1.get<int>(w, lane, 2);
|
|
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
|
|
|
|
m->addr[lane] = addr;
|
|
}
|
|
|
|
}
|
|
|
|
    // Issue a no-return atomic-add to global memory through the global
    // memory pipe. The target address comes from source operands 1/2 (see
    // calcAddr); the per-lane addend comes from operand 3.
    void
    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;

        calcAddr(w, m);

        // stage each lane's addend into the atomic data buffer
        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
        }

        setFlag(AtomicNoReturn);
        setFlag(AtomicAdd);
        setFlag(NoScope);
        setFlag(NoOrder);
        setFlag(GlobalSegment);

        m->m_type = U32::memType;
        m->v_type = U32::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0; // atomics don't have an equivalence class operand
        m->n_reg = 1;

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(64));
        w->computeUnit->globalMemoryPipe.issueRequest(m);
        // Bookkeeping: the atomic is counted as both a write and a read
        // in flight — presumably because an atomic touches memory in both
        // directions; confirm against the pipeline's accounting.
        w->outstandingReqsWrGm++;
        w->wrGmReqsInPipe--;
        w->outstandingReqsRdGm++;
        w->rdGmReqsInPipe--;
        w->outstandingReqs++;
        w->memReqsInPipe--;
    }
|
|
|
|
    // Issue a no-return atomic-add for the "group" variant through the
    // global memory pipe. Address comes from source operands 1/2 (see
    // calcAddr).
    void
    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;
        calcAddr(w, m);

        // NOTE(review): the addend is read from operand 1 here, while the
        // Global variant reads operand 3 — and operand 1 also feeds the
        // address's upper half in calcAddr. Verify this is intended.
        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
        }

        setFlag(AtomicNoReturn);
        setFlag(AtomicAdd);
        setFlag(NoScope);
        setFlag(NoOrder);
        // NOTE(review): GlobalSegment despite the "Group" name — confirm
        // the group atomic is deliberately routed to the global segment.
        setFlag(GlobalSegment);

        m->m_type = U32::memType;
        m->v_type = U32::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0; // atomics don't have an equivalence class operand
        m->n_reg = 1;

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(64));
        w->computeUnit->globalMemoryPipe.issueRequest(m);
        // atomic counted as both a write and a read in flight (matches the
        // Global variant's accounting)
        w->outstandingReqsWrGm++;
        w->wrGmReqsInPipe--;
        w->outstandingReqsRdGm++;
        w->rdGmReqsInPipe--;
        w->outstandingReqs++;
        w->memReqsInPipe--;
    }
|
|
|
|
    // Issue a u32 load from global memory through the global memory pipe.
    // The address comes from source operands 1/2 (see calcAddr).
    void
    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
    {
        GPUDynInstPtr m = gpuDynInst;
        // calculate the address
        calcAddr(w, m);

        setFlag(Load);
        setFlag(NoScope);
        setFlag(NoOrder);
        setFlag(GlobalSegment);

        m->m_type = U32::memType; //MemDataType::memType;
        m->v_type = U32::vgprType; //DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;
        m->n_reg = 1;

        // FIXME
        //m->dst_reg = this->dest.regIndex();

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        m->pipeId = GLBMEM_PIPE;
        m->latency.set(w->computeUnit->shader->ticks(1));
        w->computeUnit->globalMemoryPipe.issueRequest(m);
        // loads only touch the read-side counters
        w->outstandingReqsRdGm++;
        w->rdGmReqsInPipe--;
        w->outstandingReqs++;
        w->memReqsInPipe--;
    }
|
|
|
|
void
|
|
Call::MagicXactCasLd(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int src_val1 = 0;
|
|
|
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
|
if (mask[lane]) {
|
|
src_val1 = src1.get<int>(w, lane, 1);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
|
|
w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
|
|
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
|
|
}
|
|
|
|
w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
|
|
.push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
|
|
}
|
|
|
|
void
|
|
Call::MagicMostSigThread(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
unsigned mst = true;
|
|
|
|
for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
|
|
if (mask[lane]) {
|
|
dest.set<int>(w, lane, mst);
|
|
mst = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Call::MagicMostSigBroadcast(Wavefront *w)
|
|
{
|
|
const VectorMask &mask = w->getPred();
|
|
int res = 0;
|
|
bool got_res = false;
|
|
|
|
for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
|
|
if (mask[lane]) {
|
|
if (!got_res) {
|
|
res = src1.get<int>(w, lane, 1);
|
|
got_res = true;
|
|
}
|
|
dest.set<int>(w, lane, res);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace HsailISA
|