38 #include "debug/GPUExec.hh"
39 #include "debug/WavefrontStack.hh"
46 WavefrontParams::create()
52 :
SimObject(p), callArgMem(nullptr), _gpuISA()
89 for (
int i = 0;
i < 3; ++
i) {
101 .
name(
name() +
".src_reg_operand_dist")
102 .
desc(
"number of executed instructions with N source register operands")
107 .
name(
name() +
".dst_reg_operand_dist")
108 .
desc(
"number of executed instructions with N destination register "
114 .
name(
name() +
".timesBlockedDueWAXDependencies")
115 .
desc(
"number of times the wf's instructions are blocked due to WAW "
116 "or WAR dependencies")
121 .
name(
name() +
".timesBlockedDueRAWDependencies")
122 .
desc(
"number of times the wf's instructions are blocked due to RAW "
128 .
name(
name() +
".timesBlockedDueVrfPortAvail")
129 .
desc(
"number of times instructions are blocked due to VRF port "
167 if (ii->isGlobalMem() || ii->isFlat())
176 if (ii->isLocalMem()) {
190 ii->isReturn() || ii->isBranch() ||
191 ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
271 if (ii->isReturn() || ii->isBranch()) {
291 if (mode == 1 && size > 4) {
328 bool glbMemBusRdy =
false;
329 bool glbMemIssueRdy =
false;
335 glbMemIssueRdy =
true;
338 bool locMemBusRdy =
false;
339 bool locMemIssueRdy =
false;
345 locMemIssueRdy =
true;
353 if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
354 ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
355 ii->isMemFence() || ii->isFlat())) {
356 panic(
"next instruction: %s is of unknown type\n", ii->disassemble());
359 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
362 if (type ==
I_ALU && ii->isBarrier()) {
375 }
else if (type ==
I_ALU && ii->isNop()) {
383 }
else if (type ==
I_ALU && ii->isReturn()) {
396 }
else if (type ==
I_ALU && (ii->isBranch() ||
398 (ii->isKernArgSeg() && ii->isLoad()) ||
414 }
else if (type ==
I_GLOBAL && ii->isGlobalMem()) {
416 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
423 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
429 if (!glbMemIssueRdy) {
453 }
else if (type ==
I_SHARED && ii->isLocalMem()) {
455 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
461 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
471 if (!locMemIssueRdy) {
490 }
else if (type ==
I_FLAT && ii->isFlat()) {
501 if (!glbMemIssueRdy) {
506 if (!locMemIssueRdy) {
549 if (ii->isALU() || ii->isSpecialOp() ||
554 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
561 }
else if (ii->isBarrier()) {
564 }
else if (ii->isLoad() && ii->isFlat()) {
565 assert(Enums::SC_NONE != ii->executedAs());
568 if ( Enums::SC_SHARED == ii->executedAs() ) {
579 }
else if (ii->isStore() && ii->isFlat()) {
580 assert(Enums::SC_NONE != ii->executedAs());
583 if (Enums::SC_SHARED == ii->executedAs()) {
594 }
else if (ii->isLoad() && ii->isGlobalMem()) {
601 }
else if (ii->isStore() && ii->isGlobalMem()) {
608 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
616 }
else if (ii->isLoad() && ii->isLocalMem()) {
623 }
else if (ii->isStore() && ii->isLocalMem()) {
630 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
655 const uint32_t old_pc =
pc();
656 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
658 ii->disassemble(), old_pc);
672 if (
pc() == old_pc) {
673 uint32_t new_pc =
_gpuISA.advancePC(old_pc, ii);
676 if (new_pc ==
rpc()) {
687 const int num_active_lanes =
execMask().count();
699 if (ii->isALU() || ii->isSpecialOp() ||
704 (ii->isKernArgSeg() && ii->isLoad()) ||
713 }
else if (ii->isBarrier()) {
716 }
else if (ii->isLoad() && ii->isFlat()) {
717 assert(Enums::SC_NONE != ii->executedAs());
719 if (Enums::SC_SHARED == ii->executedAs()) {
730 }
else if (ii->isStore() && ii->isFlat()) {
731 assert(Enums::SC_NONE != ii->executedAs());
732 if (Enums::SC_SHARED == ii->executedAs()) {
743 }
else if (ii->isLoad() && ii->isGlobalMem()) {
748 }
else if (ii->isStore() && ii->isGlobalMem()) {
753 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
758 }
else if (ii->isLoad() && ii->isLocalMem()) {
763 }
else if (ii->isStore() && ii->isLocalMem()) {
768 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
786 assert(mask.count());
795 DPRINTF(WavefrontStack,
"[%2d, %2d, %2d, %2d] %s %3i => ",
797 execMask().to_string<
char, std::string::traits_type,
798 std::string::allocator_type>().c_str(),
pc());
802 DPRINTF(WavefrontStack,
"%3i %s\n",
pc(),
803 execMask().to_string<
char, std::string::traits_type,
804 std::string::allocator_type>().c_str());
859 uint8_t *iter = (uint8_t *)out;
860 for (
int i = 0;
i <
barCnt.size();
i++) {
863 *(
int *)iter =
wfId; iter +=
sizeof(
wfId);
868 *(uint32_t *)iter =
wgId; iter +=
sizeof(
wgId);
870 *(uint64_t *)iter =
initMask.to_ullong(); iter +=
sizeof(
initMask.to_ullong());
876 std::numeric_limits<uint32_t>::max(),
877 std::numeric_limits<uint64_t>::max()};
891 uint32_t vgprIdx =
remap(
i,
sizeof(uint32_t), 1);
892 for (
int lane = 0; lane < wf_size; lane++) {
894 read<uint32_t>(vgprIdx,lane);
895 *(uint32_t *)iter = regVal; iter +=
sizeof(regVal);
900 uint32_t vgprIdx =
remap(
i,
sizeof(uint64_t), 1);
901 for (
int lane = 0; lane < wf_size; lane++) {
903 read<uint64_t>(vgprIdx,lane);
904 *(uint64_t *)iter = regVal; iter +=
sizeof(regVal);
909 for (
int lane = 0; lane < wf_size; lane++) {
911 *(uint64_t *)iter = regVal; iter +=
sizeof(regVal);
919 *(
char *) iter = val; iter +=
sizeof(
val);
926 uint8_t *iter = (uint8_t *)in;
927 for (
int i = 0;
i <
barCnt.size();
i++) {
930 wfId = *(
int *)iter; iter +=
sizeof(
wfId);
935 wgId = *(uint32_t *)iter; iter +=
sizeof(
wgId);
944 if (newEntry.
pc != std::numeric_limits<uint32_t>::max()) {
952 uint32_t vgprIdx =
remap(
i,
sizeof(uint32_t), 1);
953 for (
int lane = 0; lane < wf_size; lane++) {
954 uint32_t regVal = *(uint32_t *)iter; iter +=
sizeof(regVal);
960 uint32_t vgprIdx =
remap(
i,
sizeof(uint64_t), 1);
961 for (
int lane = 0; lane < wf_size; lane++) {
962 uint64_t regVal = *(uint64_t *)iter; iter +=
sizeof(regVal);
968 for (
int lane = 0; lane < wf_size; lane++) {
969 uint64_t regVal = *(uint64_t *)iter; iter +=
sizeof(regVal);
976 char val = *(
char *) iter; iter +=
sizeof(
val);
985 for (
int d = 0;
d < 3; ++
d) {
Counter value() const
Return the current value of this stat as its base type.
std::vector< uint32_t > oldVgpr
Tick ticks(int numCycles) const
Stats::Scalar numTimesBlockedDueRAWDependencies
void setContext(const void *in)
Sets the hardware context from a stream of bytes. This method is designed for HSAIL execution...
void write(int regIdx, int threadId, T value)
Stats::Scalar numTimesBlockedDueVrfPortAvail
std::deque< std::unique_ptr< ReconvergenceStackEntry > > reconvergenceStack
Stack containing Control Flow Graph nodes (i.e., kernel instructions) to be visited by the wavefront...
Stats::Distribution controlFlowDivergenceDist
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, const VectorMask &exec_mask)
void init(uint32_t _size)
T read(int regIdx, int threadId)
class ConditionRegisterState * condRegState
bool isOldestInstFlatMem()
bool isOldestInstPrivMem()
virtual void regStats()
Register statistics for this object.
Stats::Scalar numTimesBlockedDueWAXDependencies
Stats::Scalar numInstrExecuted
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
std::vector< WaitClass > vrfToLocalMemPipeBus
bool instructionBufferHasBranch()
std::vector< WaitClass > aluPipe
GlobalMemPipeline globalMemoryPipe
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Stats::Distribution activeLanesPerLMemInstrDist
Stats::Distribution srcRegOpDist
std::vector< uint32_t > workItemId[3]
uint32_t pc
PC of current instruction.
std::deque< GPUDynInstPtr > instructionBuffer
Stats::Distribution execRateDist
void regStats()
Register statistics for this object.
std::vector< uint32_t > workItemFlatId
Wavefront(const Params *p)
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a particular wavefront. This should be updated ever...
std::vector< WaitClass > vrfToGlobalMemPipeBus
void updateInstStats(GPUDynInstPtr gpuDynInst)
std::vector< int > barCnt
ComputeUnit * computeUnit
Stats::Distribution dstRegOpDist
bool isLmInstruction(GPUDynInstPtr ii)
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
uint32_t outstandingReqsWrLm
void getContext(const void *out)
Returns the hardware context as a stream of bytes. This method is designed for HSAIL execution...
uint32_t outstandingReqsRdGm
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
void computeActualWgSz(NDRange *ndr)
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
T read(const uint32_t index)
a read operation
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
uint32_t outstandingReqsRdLm
static const int NumArgumentRegs M5_VAR_USED
bool isOldestInstBarrier()
uint32_t outstandingReqsWrGm
bool isGmInstruction(GPUDynInstPtr ii)
Derived & name(const std::string &name)
Set the name and mark this stat to print at the end of simulation.
std::vector< Addr > lastAddr
TheGpuISA::GPUISA _gpuISA
Stats::Distribution activeLanesPerGMemInstrDist
virtual const std::string name() const
VectorMask execMask() const
A reconvergence stack entry conveys the necessary state to implement control flow divergence...
Stats::Scalar numVecOpsExecuted
std::vector< VectorRegisterFile * > vrf
void start(uint64_t _wfDynId, uint64_t _base_ptr)
std::vector< uint64_t > oldDgpr
void popFromReconvergenceStack()
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
void write(const uint32_t index, const T value)
a write operation
Derived & desc(const std::string &_desc)
Set the description and mark this stat to print at the end of simulation.
std::vector< uint64_t > lastExecCycle
std::vector< WaitClass > wfWait
LocalMemPipeline localMemoryPipe
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0)
std::vector< uint8_t >::size_type size() const
get the size of this chunk
Stats::Scalar totalCycles
Abstract superclass for simulation objects.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
uint32_t rpc
PC of the immediate post-dominator instruction, i.e., the value of pc for the first instruction that ...
bool waitingAtBarrier(int lane)
VectorMask execMask
Execution mask.