40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUSync.hh"
47 #include "debug/GPUTLB.hh"
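// The fragments that follow are from the ComputeUnit constructor: the member
// initializer list wires up the pipeline stages (scoreboard check, schedule,
// exec, global/local memory pipes) and copies the Python-side parameters
// (SIMD count, bus widths, TLB/prefetch options, and request/response
// latencies converted to ticks via the clock domain).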
60 scoreboardCheckStage(p), scheduleStage(p), execStage(p),
61 globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
62 cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
63 spBypassPipeLength(p->spbypass_pipe_length),
64 dpBypassPipeLength(p->dpbypass_pipe_length),
65 issuePeriod(p->issue_period),
66 numGlbMemUnits(p->num_global_mem_pipes),
67 numLocMemUnits(p->num_shared_mem_pipes),
68 perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
69 prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
70 xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
71 functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
72 countPages(p->countPages), barrier_id(0),
73 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
74 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
75 req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
76 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
77 _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
78 lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
79 globalSeqNum(0), wavefrontSize(p->wfSize),
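// Sanity checks on the configured wavefront size: it must fit in the host's
// unsigned long long (used for the per-lane execution mask) and, per the
// second message, be a power of two.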
91 fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
93 "WF size is larger than the host can support");
95 "Wavefront size should be a power of 2");
99 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
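// Wavefront bookkeeping: each SIMD unit j owns n_wf wavefront slots, filled
// from the flat p->wavefronts vector at index j * n_wf + i.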
111 for (int i = 0; i < p->n_wf; ++i) {
114 wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
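// The wavefront execution policy is chosen by string: "OLDEST-FIRST" or
// "ROUND-ROBIN"; anything else is a fatal configuration error.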
133 if (p->execPolicy == "OLDEST-FIRST") {
135 } else if (p->execPolicy == "ROUND-ROBIN") {
138 fatal("Invalid WF execution policy (CU)\n");
153 for (int i = 0; i < vrf.size(); ++i) {
154 vrf[i]->setParent(this);
204 while (i < vecSize) {
207 vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
220 vrf[i]->updateEvents();
229 static int _n_wave = 0;
241 w->initMask = init_mask.to_ullong();
286 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
303 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
325 gpuDynInst->useContinuation = false;
348 uint32_t normSize = 0;
351 allocateRegion(vregDemand, &normSize);
368 int trueWgSizeTotal = 1;
370 for (int d = 0; d < 3; ++d) {
374 trueWgSizeTotal *= trueWgSize[d];
375 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
378 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
383 bool vregAvail = true;
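// A workgroup of trueWgSizeTotal work-items needs
// ceil(trueWgSizeTotal / wfSize()) wavefronts, computed below with the
// usual add-(divisor-1)-then-divide idiom.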
384 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
389 int numMappedWfs = 0;
398 if (numMappedWfs < numWfs) {
409 if (freeWfSlots >= numWfs) {
414 vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
426 DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
427 freeWfSlots, vregAvail);
448 DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
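// AllAtBarrier: walk every wavefront slot on every SIMD, count the waves
// that belong to this barrier id and have reached the barrier (ccnt), and
// report completion once that count equals the number of participating
// slots (bslots).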
451 for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
452 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
456 DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
458 DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
461 DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
470 DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
471 "%d\n", i_simd, i_wf, ccnt);
476 DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
477 cu_id, ccnt, bslots);
479 return ccnt == bslots;
501 if (!curWaveIDQueue.empty()) {
502 for (auto it : curWaveIDQueue) {
505 if (cur_wave.simdId == simdId &&
556 "No support for multiple Global Memory Pipelines exists!!!");
564 "No support for multiple Local Memory Pipelines exists!!!");
592 readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
635 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
645 DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
649 if (gpuDynInst->useContinuation) {
650 assert(!gpuDynInst->isNoScope());
651 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
660 if (gpuDynInst->useContinuation) {
661 assert(!gpuDynInst->isNoScope());
662 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
676 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
688 int len = retries.size();
692 for (int i = 0; i < len; ++i) {
695 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
696 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
702 if (!sendTimingReq(pkt)) {
703 DPRINTF(GPUMem, "failed again!\n");
706 DPRINTF(GPUMem, "successful!\n");
715 computeUnit->fetchStage.processFetchReturn(pkt);
723 int len = retries.size();
727 for (int i = 0; i < len; ++i) {
730 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
731 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
733 if (!sendTimingReq(pkt)) {
734 DPRINTF(GPUFetch, "failed again!\n");
737 DPRINTF(GPUFetch, "successful!\n");
763 } else if (pkt->isRead()) {
766 fatal("pkt is not a read nor a write\n");
780 if ((vaddr + size - 1) % 64 < vaddr % 64) {
781 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
782 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
789 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
790 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
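// Functional translation path (apparently taken when the functionalTLB
// option is set): attach a TranslationState, translate the packet
// immediately with sendFunctional(), and record which TLB level serviced
// the request (hitLevel) for the TLB statistics.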
800 TheISA::GpuTLB::TranslationState *translation_state =
801 new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
807 tlbPort[tlbPort_index]->sendFunctional(pkt);
810 int hit_level = translation_state->hitLevel;
811 assert(hit_level != -1);
819 delete sender_state->saved;
825 uint8_t *tmpData = pkt->getPtr<uint8_t>();
833 pkt = new Packet(oldPkt->req, oldPkt->cmd);
842 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
843 gpuDynInst->tlbHitLevel[index] = hit_level;
851 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
852 "scheduled\n", cu_id, gpuDynInst->simdId,
853 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
856 } else if (tlbPort[tlbPort_index]->isStalled()) {
857 assert(tlbPort[tlbPort_index]->retries.size() > 0);
859 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
860 "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
863 tlbPort[tlbPort_index]->retries.push_back(pkt);
864 } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
869 tlbPort[tlbPort_index]->stallPort();
871 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
872 "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
875 tlbPort[tlbPort_index]->retries.push_back(pkt);
878 "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
879 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
885 gpuDynInst->statusBitVector &= (~(1ll << index));
892 pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
895 tlbPort[tlbPort_index]->sendFunctional(pkt);
905 memPort[0]->sendFunctional(new_pkt);
907 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
908 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
912 TheISA::GpuTLB::TranslationState *sender_state =
915 delete sender_state->tlbEntry;
934 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
935 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
945 assert(gpuDynInst->isGlobalSeg());
959 gpuDynInst->setRequestFlags(req, kernelLaunch);
978 return "ComputeUnit memory response event";
992 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
993 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
994 pkt->req->getPaddr(), dataPort->index);
996 Addr paddr = pkt->req->getPaddr();
999 int index = gpuDynInst->memStatusVector[paddr].back();
1001 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1002 pkt->req->getPaddr(), index);
1004 gpuDynInst->memStatusVector[paddr].pop_back();
1005 gpuDynInst->pAddr = pkt->req->getPaddr();
1007 if (pkt->isRead() || pkt->isWrite()) {
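// Response bookkeeping: statusVector[index] counts the packets still
// outstanding for this port index; its bit in statusBitVector is cleared
// once the count drains to zero, and when the whole bit vector is empty the
// dynamic instruction's memory accesses are complete.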
1010 gpuDynInst->statusBitVector &= (~(1ULL << index));
1012 assert(gpuDynInst->statusVector[index] > 0);
1013 gpuDynInst->statusVector[index]--;
1015 if (!gpuDynInst->statusVector[index])
1016 gpuDynInst->statusBitVector &= (~(1ULL << index));
1019 DPRINTF(GPUMem, "bitvector is now %#x\n",
1020 gpuDynInst->statusBitVector);
1022 if (gpuDynInst->statusBitVector == VectorMask(0)) {
1023 auto iter = gpuDynInst->memStatusVector.begin();
1024 auto end = gpuDynInst->memStatusVector.end();
1026 while (iter != end) {
1027 assert(iter->second.empty());
1031 gpuDynInst->memStatusVector.clear();
1034 gpuDynInst->statusVector.clear();
1038 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1039 compute_unit->cu_id, gpuDynInst->simdId,
1040 gpuDynInst->wfSlotId);
1046 if (gpuDynInst->useContinuation) {
1047 assert(!gpuDynInst->isNoScope());
1048 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1056 if (gpuDynInst->useContinuation) {
1057 assert(!gpuDynInst->isNoScope());
1058 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1063 delete pkt->senderState;
1069 ComputeUnitParams::create()
1079 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1083 computeUnit->tlbCycles += curTick();
1086 TheISA::GpuTLB::TranslationState *translation_state =
1090 if (!translation_state->tlbEntry->valid) {
1095 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1098 DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1102 assert(translation_state->tlbEntry->valid);
1105 int hit_level = translation_state->hitLevel;
1106 computeUnit->hitsPerTLBLevel[hit_level]++;
1108 delete translation_state->tlbEntry;
1109 assert(!translation_state->ports.size());
1115 delete translation_state;
1124 gpuDynInst->memStatusVector[line].push_back(mp_index);
1125 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1136 panic("unsupported response to request conversion %s\n",
1140 if (computeUnit->prefetchDepth) {
1141 int simdId = gpuDynInst->simdId;
1142 int wfSlotId = gpuDynInst->wfSlotId;
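// TLB prefetching (when prefetchDepth > 0): the last translated vaddr is
// tracked per CU, per SIMD, or per wavefront depending on prefetchType, the
// stride is derived from the previous access (or forced to prefetchStride
// for PF_STRIDE), and prefetchDepth further translations are issued
// functionally just to warm the TLB; the resulting packets are discarded.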
1145 switch(computeUnit->prefetchType) {
1147 last = computeUnit->lastVaddrCU[mp_index];
1149 case Enums::PF_PHASE:
1150 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1153 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1158 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1159 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1165 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1167 computeUnit->lastVaddrCU[mp_index] = vaddr;
1168 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1169 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1171 stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1172 computeUnit->prefetchStride: stride;
1174 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1175 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1177 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1180 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1181 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1190 computeUnit->masterId(),
1195 prefetch_pkt->dataStatic(&foo);
1198 prefetch_pkt->senderState =
1199 new TheISA::GpuTLB::TranslationState(TLB_mode,
1200 computeUnit->shader->gpuTc,
1204 sendFunctional(prefetch_pkt);
1207 TheISA::GpuTLB::TranslationState *tlb_state =
1208 safe_cast<TheISA::GpuTLB::TranslationState*>(
1209 prefetch_pkt->senderState);
1212 delete tlb_state->tlbEntry;
1214 delete prefetch_pkt->req;
1215 delete prefetch_pkt;
1237 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1238 computeUnit->cu_id, gpuDynInst->simdId,
1239 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1241 computeUnit->schedule(mem_req_event, curTick() +
1242 computeUnit->req_tick_latency);
1250 return "ComputeUnit memory request event";
1260 if (!(dataPort->sendTimingReq(pkt))) {
1261 dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
1264 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1265 compute_unit->cu_id, gpuDynInst->simdId,
1266 gpuDynInst->wfSlotId, dataPort->index,
1267 pkt->req->getPaddr());
1270 "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1271 compute_unit->cu_id, gpuDynInst->simdId,
1272 gpuDynInst->wfSlotId, dataPort->index,
1273 pkt->req->getPaddr());
1286 int len = retries.size();
1288 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1289 computeUnit->cu_id, len);
1292 assert(isStalled());
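// Drain the DTLB retry queue in order: a request is popped only after a
// successful sendTimingReq(); the first one that fails again presumably
// stops the loop and leaves the rest queued for the next retry callback.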
1297 for (int i = 0; i < len; ++i) {
1300 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1302 if (!sendTimingReq(pkt)) {
1305 DPRINTF(GPUTLB, ": failed again\n");
1308 DPRINTF(GPUTLB, ": successful\n");
1309 retries.pop_front();
1318 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1324 TheISA::GpuTLB::TranslationState *translation_state =
1327 bool success = translation_state->tlbEntry->valid;
1328 delete translation_state->tlbEntry;
1329 assert(!translation_state->ports.size());
1331 delete translation_state;
1348 computeUnit->fetchStage.fetch(pkt, wavefront);
1350 if (wavefront->dropFetch) {
1351 assert(wavefront->instructionBuffer.empty());
1352 wavefront->dropFetch = false;
1355 wavefront->pendingFetch = 0;
1371 int len = retries.size();
1372 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1375 assert(isStalled());
1381 for (int i = 0; i < len; ++i) {
1384 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1386 if (!sendTimingReq(pkt)) {
1388 DPRINTF(GPUTLB, ": failed again\n");
1391 DPRINTF(GPUTLB, ": successful\n");
1392 retries.pop_front();
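// The fragments below are from ComputeUnit::regStats(): each statistic is
// registered with a dotted name under this CU's name() plus a description,
// covering instruction mix, memory traffic, TLB behavior, LDS bank
// conflicts, and workgroup-blocking counters.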
1404 .desc("Number of vector ALU insts issued.")
1407 .name(name() + ".valu_insts_per_wf")
1408 .desc("The avg. number of vector ALU insts issued per-wavefront.")
1412 .desc("Number of scalar ALU insts issued.")
1415 .name(name() + ".salu_insts_per_wf")
1416 .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1419 .name(name() + ".inst_cycles_valu")
1420 .desc("Number of cycles needed to execute VALU insts.")
1423 .name(name() + ".inst_cycles_salu")
1424 .desc("Number of cycles needed to execute SALU insts.")
1427 .name(name() + ".thread_cycles_valu")
1428 .desc("Number of thread cycles used to execute vector ALU ops. "
1429 "Similar to instCyclesVALU but multiplied by the number of "
1433 .name(name() + ".valu_utilization")
1434 .desc("Percentage of active vector ALU threads in a wave.")
1437 .name(name() + ".lds_no_flat_insts")
1438 .desc("Number of LDS insts issued, not including FLAT "
1439 "accesses that resolve to LDS.")
1442 .name(name() + ".lds_no_flat_insts_per_wf")
1443 .desc("The avg. number of LDS insts (not including FLAT "
1444 "accesses that resolve to LDS) per-wavefront.")
1448 .desc("The number of FLAT insts that resolve to vmem issued.")
1451 .name(name() + ".flat_vmem_insts_per_wf")
1452 .desc("The average number of FLAT insts that resolve to vmem "
1453 "issued per-wavefront.")
1457 .desc("The number of FLAT insts that resolve to LDS issued.")
1460 .name(name() + ".flat_lds_insts_per_wf")
1461 .desc("The average number of FLAT insts that resolve to LDS "
1462 "issued per-wavefront.")
1465 .name(name() + ".vector_mem_writes")
1466 .desc("Number of vector mem write insts (excluding FLAT insts).")
1469 .name(name() + ".vector_mem_writes_per_wf")
1470 .desc("The average number of vector mem write insts "
1471 "(excluding FLAT insts) per-wavefront.")
1474 .name(name() + ".vector_mem_reads")
1475 .desc("Number of vector mem read insts (excluding FLAT insts).")
1478 .name(name() + ".vector_mem_reads_per_wf")
1479 .desc("The avg. number of vector mem read insts (excluding "
1480 "FLAT insts) per-wavefront.")
1483 .name(name() + ".scalar_mem_writes")
1484 .desc("Number of scalar mem write insts.")
1487 .name(name() + ".scalar_mem_writes_per_wf")
1488 .desc("The average number of scalar mem write insts per-wavefront.")
1491 .name(name() + ".scalar_mem_reads")
1492 .desc("Number of scalar mem read insts.")
1495 .name(name() + ".scalar_mem_reads_per_wf")
1496 .desc("The average number of scalar mem read insts per-wavefront.")
1512 .desc("total number of cycles for all uncoalesced requests")
1517 .desc("number of uncoalesced requests")
1521 .name(name() + ".avg_translation_latency")
1522 .desc("Avg. translation latency for data translations")
1529 .name(name() + ".TLB_hits_distribution")
1530 .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
1534 for (int i = 0; i < 4; ++i) {
1544 .desc("Instruction Execution Rate: Number of executed vector "
1545 "instructions per cycle")
1550 .name(name() + ".lds_bank_conflicts")
1551 .desc("Number of bank conflicts per LDS memory packet")
1555 .name(name() + ".lds_bank_access_cnt")
1556 .desc("Total number of LDS bank accesses")
1564 .name(name() + ".page_divergence_dist")
1565 .desc("pages touched per wf (over all mem. instr.)")
1570 .name(name() + ".warp_execution_dist")
1571 .desc("number of lanes active per instruction (over all instructions)")
1576 .name(name() + ".gmem_lanes_execution_dist")
1577 .desc("number of active lanes per global memory instruction")
1582 .name(name() + ".lmem_lanes_execution_dist")
1583 .desc("number of active lanes per local memory instruction")
1587 .name(name() + ".num_instr_executed")
1588 .desc("number of instructions executed")
1592 .name(name() + ".num_vec_ops_executed")
1593 .desc("number of vec ops executed (e.g. WF size/inst)")
1597 .name(name() + ".num_total_cycles")
1598 .desc("number of cycles the CU ran for")
1603 .desc("Instructions per cycle (this CU only)")
1608 .desc("Vector Operations per cycle (this CU only)")
1612 .name(name() + ".num_alu_insts_executed")
1613 .desc("Number of dynamic non-GM memory insts executed")
1617 .name(name() + ".wg_blocked_due_lds_alloc")
1618 .desc("Workgroup blocked due to LDS capacity")
1625 .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1626 .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1630 .name(name() + ".global_mem_instr_cnt")
1631 .desc("dynamic global memory instructions count")
1635 .name(name() + ".local_mem_instr_cnt")
1636 .desc("dynamic local memory instruction count")
1643 .name(name() + ".num_completed_wfs")
1644 .desc("number of completed wavefronts")
1649 .desc("number of compare and swap operations")
1653 .name(name() + ".num_failed_CAS_ops")
1654 .desc("number of compare and swap operations that failed")
1671 if (gpuDynInst->isScalar()) {
1672 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1675 } else if (gpuDynInst->isLoad()) {
1677 } else if (gpuDynInst->isStore()) {
1681 if (gpuDynInst->isALU()) {
1685 } else if (gpuDynInst->isFlat()) {
1686 if (gpuDynInst->isLocalMem()) {
1691 } else if (gpuDynInst->isLocalMem()) {
1693 } else if (gpuDynInst->isLoad()) {
1695 } else if (gpuDynInst->isStore()) {
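// Exit callback: when countPages is enabled, per-page access counts are
// dumped as CSV ("page, wavefront accesses, workitem accesses"), with the
// page address in hex and both counters in decimal.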
1715 if (computeUnit->countPages) {
1716 std::ostream *page_stat_file =
1719 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1722 for (auto iter : computeUnit->pageAccesses) {
1723 *page_stat_file << std::hex << iter.first << ",";
1724 *page_stat_file << std::dec << iter.second.first << ",";
1725 *page_stat_file << std::dec << iter.second.second << std::endl;
1739 bool glbMemBusRdy = true;
1743 bool locMemBusRdy = true;
1782 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
1822 fatal_if(!senderState, "did not get the right sort of sender state");
1830 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1844 fatal_if(!sender_state, "packet without a valid sender state");
1849 fatal_if(retries.empty(), "must have retries waiting to be stalled");
1853 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1854 computeUnit->cu_id, gpuDynInst->simdId,
1855 gpuDynInst->wfSlotId);
1863 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1864 computeUnit->cu_id, gpuDynInst->simdId,
1868 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1869 computeUnit->cu_id, gpuDynInst->simdId,
1884 auto queueSize = retries.size();
1886 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1887 computeUnit->cu_id, queueSize);
1890 "why was there a recvReqRetry() with no pending reqs?");
1892 "recvReqRetry() happened when the port was not stalled");
1896 while (!retries.empty()) {
1899 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1904 DPRINTF(GPUPort, ": LDS send failed again\n");
1907 DPRINTF(GPUPort, ": LDS send successful\n");