compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its contributors
18  * may be used to endorse or promote products derived from this software
19  * without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Author: John Kalamatianos, Anthony Gutierrez
34  */
 35 #include "gpu-compute/compute_unit.hh"
 36 
37 #include <limits>
38 
39 #include "base/output.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUSync.hh"
47 #include "debug/GPUTLB.hh"
 48 #include "gpu-compute/dispatcher.hh"
 49 #include "gpu-compute/gpu_dyn_inst.hh"
 50 #include "gpu-compute/gpu_static_inst.hh"
 51 #include "gpu-compute/ndrange.hh"
 52 #include "gpu-compute/shader.hh"
 53 #include "gpu-compute/simple_pool_manager.hh"
 54 #include "gpu-compute/vector_register_file.hh"
 55 #include "gpu-compute/wavefront.hh"
56 #include "mem/page_table.hh"
57 #include "sim/process.hh"
58 
59 ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
60  scoreboardCheckStage(p), scheduleStage(p), execStage(p),
61  globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
62  cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
63  spBypassPipeLength(p->spbypass_pipe_length),
64  dpBypassPipeLength(p->dpbypass_pipe_length),
65  issuePeriod(p->issue_period),
66  numGlbMemUnits(p->num_global_mem_pipes),
67  numLocMemUnits(p->num_shared_mem_pipes),
68  perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
69  prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
70  xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
71  functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
72  countPages(p->countPages), barrier_id(0),
73  vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
74  coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
75  req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
76  resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
77  _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
78  lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
79  globalSeqNum(0), wavefrontSize(p->wfSize),
80  kernelLaunchInst(new KernelLaunchStaticInst())
81 {
91  fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
92  p->wfSize <= 0,
93  "WF size is larger than the host can support");
95  "Wavefront size should be a power of 2");
96  // calculate how many cycles a vector load or store will need to transfer
97  // its data over the corresponding buses
 98  numCyclesPerStoreTransfer =
 99  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
 100  (double)vrfToCoalescerBusWidth);
 101 
 102  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
 103  / coalescerToVrfBusWidth;
 104 
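     // Worked example (illustrative numbers only): with a 64-lane wavefront
     // and a 32-byte vrf_to_coalescer_bus_width, one store moves
     // 64 * sizeof(uint32_t) = 256 bytes, so it occupies ceil(256 / 32) = 8
     // bus cycles; the load path divides by coalescer_to_vrf_bus_width the
     // same way.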
105  lastVaddrWF.resize(numSIMDs);
106  wfList.resize(numSIMDs);
107 
108  for (int j = 0; j < numSIMDs; ++j) {
109  lastVaddrWF[j].resize(p->n_wf);
110 
111  for (int i = 0; i < p->n_wf; ++i) {
112  lastVaddrWF[j][i].resize(wfSize());
113 
114  wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
115  wfList[j][i]->setParent(this);
116 
117  for (int k = 0; k < wfSize(); ++k) {
118  lastVaddrWF[j][i][k] = 0;
119  }
120  }
121  }
122 
123  lastVaddrSimd.resize(numSIMDs);
124 
125  for (int i = 0; i < numSIMDs; ++i) {
126  lastVaddrSimd[i].resize(wfSize(), 0);
127  }
128 
129  lastVaddrCU.resize(wfSize());
130 
131  lds.setParent(this);
132 
 133  if (p->execPolicy == "OLDEST-FIRST") {
 134  exec_policy = EXEC_POLICY::OLDEST;
 135  } else if (p->execPolicy == "ROUND-ROBIN") {
 136  exec_policy = EXEC_POLICY::RR;
 137  } else {
138  fatal("Invalid WF execution policy (CU)\n");
139  }
140 
141  memPort.resize(wfSize());
142 
143  // resize the tlbPort vectorArray
144  int tlbPort_width = perLaneTLB ? wfSize() : 1;
145  tlbPort.resize(tlbPort_width);
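     // With perLaneTLB set, every lane gets its own translation port;
     // otherwise all lanes share tlbPort[0] (see the tlbPort_index selection
     // in sendRequest() below).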
146 
147  cuExitCallback = new CUExitCallback(this);
 148  registerExitCallback(cuExitCallback);
 149 
150  xactCasLoadMap.clear();
151  lastExecCycle.resize(numSIMDs, 0);
152 
153  for (int i = 0; i < vrf.size(); ++i) {
154  vrf[i]->setParent(this);
155  }
156 
157  numVecRegsPerSimd = vrf[0]->numRegs();
158 }
159 
 160 ComputeUnit::~ComputeUnit()
 161 {
162  // Delete wavefront slots
163  for (int j = 0; j < numSIMDs; ++j) {
164  for (int i = 0; i < shader->n_wf; ++i) {
165  delete wfList[j][i];
166  }
167  lastVaddrSimd[j].clear();
168  }
169  lastVaddrCU.clear();
170  readyList.clear();
171  waveStatusList.clear();
172  dispatchList.clear();
173  vectorAluInstAvail.clear();
174  delete cuExitCallback;
175  delete ldsPort;
176 }
177 
178 void
 179 ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
 180 {
181  w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
182 
183  w->workGroupSz[0] = ndr->q.wgSize[0];
184  w->workGroupSz[1] = ndr->q.wgSize[1];
185  w->workGroupSz[2] = ndr->q.wgSize[2];
186  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
187  w->gridSz[0] = ndr->q.gdSize[0];
188  w->gridSz[1] = ndr->q.gdSize[1];
189  w->gridSz[2] = ndr->q.gdSize[2];
190  w->kernelArgs = ndr->q.args;
191  w->privSizePerItem = ndr->q.privMemPerItem;
 192  w->spillSizePerItem = ndr->q.spillMemPerItem;
 193  w->roBase = ndr->q.roMemStart;
194  w->roSize = ndr->q.roMemTotal;
195  w->computeActualWgSz(ndr);
196 }
197 
198 void
 199 ComputeUnit::updateEvents() {
 200 
201  if (!timestampVec.empty()) {
202  uint32_t vecSize = timestampVec.size();
203  uint32_t i = 0;
204  while (i < vecSize) {
205  if (timestampVec[i] <= shader->tick_cnt) {
 206  std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
 207  vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
208  statusVec[i]);
209  timestampVec.erase(timestampVec.begin() + i);
210  regIdxVec.erase(regIdxVec.begin() + i);
211  statusVec.erase(statusVec.begin() + i);
212  --vecSize;
213  --i;
214  }
215  ++i;
216  }
217  }
218 
219  for (int i = 0; i< numSIMDs; ++i) {
220  vrf[i]->updateEvents();
221  }
222 }
223 
224 
225 void
 226 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 227  NDRange *ndr)
228 {
229  static int _n_wave = 0;
230 
231  VectorMask init_mask;
232  init_mask.reset();
233 
234  for (int k = 0; k < wfSize(); ++k) {
235  if (k + waveId * wfSize() < w->actualWgSzTotal)
236  init_mask[k] = 1;
237  }
238 
239  w->kernId = ndr->dispatchId;
240  w->wfId = waveId;
241  w->initMask = init_mask.to_ullong();
242 
243  for (int k = 0; k < wfSize(); ++k) {
244  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
245  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
246  w->actualWgSz[1];
247  w->workItemId[2][k] = (k + waveId * wfSize()) /
248  (w->actualWgSz[0] * w->actualWgSz[1]);
249 
250  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
251  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
252  w->workItemId[0][k];
253  }
254 
255  w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
256 
257  w->barCnt.resize(wfSize(), 0);
258 
259  w->maxBarCnt = 0;
260  w->oldBarrierCnt = 0;
261  w->barrierCnt = 0;
262 
263  w->privBase = ndr->q.privMemStart;
264  ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
265 
266  w->spillBase = ndr->q.spillMemStart;
267  ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
268 
269  w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
270 
271  // WG state
272  w->wgId = ndr->globalWgId;
273  w->dispatchId = ndr->dispatchId;
274  w->workGroupId[0] = w->wgId % ndr->numWg[0];
275  w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
276  w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
277 
278  w->barrierId = barrier_id;
279  w->stalledAtBarrier = false;
280 
281  // set the wavefront context to have a pointer to this section of the LDS
282  w->ldsChunk = ldsChunk;
283 
284  int32_t refCount M5_VAR_USED =
 285  lds.increaseRefCounter(w->dispatchId, w->wgId);
 286  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
287  cu_id, w->wgId, refCount);
288 
289  w->instructionBuffer.clear();
290 
291  if (w->pendingFetch)
292  w->dropFetch = true;
293 
294  // is this the last wavefront in the workgroup
295  // if set the spillWidth to be the remaining work-items
296  // so that the vector access is correct
297  if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
298  w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
299  } else {
300  w->spillWidth = wfSize();
301  }
302 
303  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
304  "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
305 
306  w->start(++_n_wave, ndr->q.code_ptr);
307 }
308 
309 void
 310 ComputeUnit::StartWorkgroup(NDRange *ndr)
 311 {
312  // reserve the LDS capacity allocated to the work group
313  // disambiguated by the dispatch ID and workgroup ID, which should be
314  // globally unique
315  LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
316  ndr->q.ldsSize);
317 
318  // Send L1 cache acquire
319  // isKernel + isAcquire = Kernel Begin
 320  if (shader->impl_kern_boundary_sync) {
 321  GPUDynInstPtr gpuDynInst =
322  std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
323  getAndIncSeqNum());
324 
325  gpuDynInst->useContinuation = false;
326  injectGlobalMemFence(gpuDynInst, true);
327  }
328 
329  // calculate the number of 32-bit vector registers required by wavefront
330  int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
331  int wave_id = 0;
332 
333  // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
334  for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
335  Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
336  // Check if this wavefront slot is available:
337  // It must be stopped and not waiting
338  // for a release to complete S_RETURNING
339  if (w->status == Wavefront::S_STOPPED) {
340  fillKernelState(w, ndr);
341  // if we have scheduled all work items then stop
342  // scheduling wavefronts
343  if (wave_id * wfSize() >= w->actualWgSzTotal)
344  break;
345 
346  // reserve vector registers for the scheduled wavefront
 347  assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
 348  uint32_t normSize = 0;
349 
350  w->startVgprIndex = vrf[m % numSIMDs]->manager->
351  allocateRegion(vregDemand, &normSize);
352 
353  w->reservedVectorRegs = normSize;
 354  vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
 355 
356  startWavefront(w, wave_id, ldsChunk, ndr);
357  ++wave_id;
358  }
359  }
360  ++barrier_id;
361 }
362 
363 int
 364 ComputeUnit::ReadyWorkgroup(NDRange *ndr)
 365 {
366  // Get true size of workgroup (after clamping to grid size)
367  int trueWgSize[3];
368  int trueWgSizeTotal = 1;
369 
370  for (int d = 0; d < 3; ++d) {
371  trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
372  ndr->wgId[d] * ndr->q.wgSize[d]);
373 
374  trueWgSizeTotal *= trueWgSize[d];
375  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
376  }
377 
378  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
379 
380  // calculate the number of 32-bit vector registers required by each
381  // work item of the work group
382  int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
383  bool vregAvail = true;
384  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
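     // e.g. a 100 work-item workgroup with wfSize() == 64 rounds up to
     // numWfs = 2 wavefront slots.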
385  int freeWfSlots = 0;
386  // check if the total number of VGPRs required by all WFs of the WG
387  // fit in the VRFs of all SIMD units
388  assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
389  int numMappedWfs = 0;
390  std::vector<int> numWfsPerSimd;
391  numWfsPerSimd.resize(numSIMDs, 0);
392  // find how many free WF slots we have across all SIMDs
393  for (int j = 0; j < shader->n_wf; ++j) {
394  for (int i = 0; i < numSIMDs; ++i) {
395  if (wfList[i][j]->status == Wavefront::S_STOPPED) {
396  // count the number of free WF slots
397  ++freeWfSlots;
398  if (numMappedWfs < numWfs) {
399  // count the WFs to be assigned per SIMD
400  numWfsPerSimd[i]++;
401  }
402  numMappedWfs++;
403  }
404  }
405  }
406 
407  // if there are enough free WF slots then find if there are enough
408  // free VGPRs per SIMD based on the WF->SIMD mapping
409  if (freeWfSlots >= numWfs) {
410  for (int j = 0; j < numSIMDs; ++j) {
411  // find if there are enough free VGPR regions in the SIMD's VRF
412  // to accommodate the WFs of the new WG that would be mapped to
413  // this SIMD unit
414  vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
415  vregDemandPerWI);
416 
417  // stop searching if there is at least one SIMD
418  // whose VRF does not have enough free VGPR pools.
419  // This is because a WG is scheduled only if ALL
420  // of its WFs can be scheduled
421  if (!vregAvail)
422  break;
423  }
424  }
425 
426  DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
427  freeWfSlots, vregAvail);
428 
429  if (!vregAvail) {
 430  ++numTimesWgBlockedDueVgprAlloc;
 431  }
432 
433  // Return true if enough WF slots to submit workgroup and if there are
434  // enough VGPRs to schedule all WFs to their SIMD units
435  if (!lds.canReserve(ndr->q.ldsSize)) {
 436  wgBlockedDueLdsAllocation++;
 437  }
438 
439  // Return true if (a) there are enough free WF slots to submit
 440  // workgroup and (b) if there are enough VGPRs to schedule all WFs to their
441  // SIMD units and (c) if there is enough space in LDS
442  return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
443 }
444 
445 int
446 ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
447 {
448  DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
449  int ccnt = 0;
450 
451  for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
452  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
453  Wavefront *w = wfList[i_simd][i_wf];
454 
455  if (w->status == Wavefront::S_RUNNING) {
456  DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
457 
458  DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
459  w->barrierId, _barrier_id);
460 
461  DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
462  w->barrierCnt, bcnt);
463  }
464 
465  if (w->status == Wavefront::S_RUNNING &&
466  w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
467  !w->outstandingReqs) {
468  ++ccnt;
469 
470  DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
471  "%d\n", i_simd, i_wf, ccnt);
472  }
473  }
474  }
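     // ccnt counts the wavefronts of this barrier generation (matching
     // barrier id and barrier count) that have no outstanding memory
     // requests; the barrier only releases once all bslots wavefronts of the
     // workgroup have checked in (see the comparison below).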
475 
476  DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
477  cu_id, ccnt, bslots);
478 
479  return ccnt == bslots;
480 }
481 
482 // Check if the current wavefront is blocked on additional resources.
483 bool
484 ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
485 {
486  bool cede = false;
487 
488  // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
489  // magic instructions will impact the scheduling of wavefronts
490  if (xact_cas_mode) {
491  /*
492  * When a wavefront calls xact_cas_ld, it adds itself to a per address
493  * queue. All per address queues are managed by the xactCasLoadMap.
494  *
495  * A wavefront is not blocked if: it is not in ANY per address queue or
496  * if it is at the head of a per address queue.
497  */
498  for (auto itMap : xactCasLoadMap) {
499  std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
500 
501  if (!curWaveIDQueue.empty()) {
502  for (auto it : curWaveIDQueue) {
503  waveIdentifier cur_wave = it;
504 
505  if (cur_wave.simdId == simdId &&
506  cur_wave.wfSlotId == wfSlotId) {
507  // 2 possibilities
508  // 1: this WF has a green light
509  // 2: another WF has a green light
510  waveIdentifier owner_wave = curWaveIDQueue.front();
511 
512  if (owner_wave.simdId != cur_wave.simdId ||
513  owner_wave.wfSlotId != cur_wave.wfSlotId) {
514  // possibility 2
515  cede = true;
516  break;
517  } else {
518  // possibility 1
519  break;
520  }
521  }
522  }
523  }
524  }
525  }
526 
527  return cede;
528 }
529 
530 // Execute one clock worth of work on the ComputeUnit.
531 void
 532 ComputeUnit::exec()
 533 {
534  updateEvents();
535  // Execute pipeline stages in reverse order to simulate
536  // the pipeline latency
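     // Walking the stages from back to front lets each stage consume what its
     // upstream neighbour produced on the previous cycle, so one pass per tick
     // is enough to model the pipeline latency without double-buffering every
     // inter-stage latch.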
 537  globalMemoryPipe.exec();
 538  localMemoryPipe.exec();
 539  execStage.exec();
 540  scheduleStage.exec();
 541  scoreboardCheckStage.exec();
542  fetchStage.exec();
543 
544  totalCycles++;
545 }
546 
547 void
 548 ComputeUnit::init()
 549 {
 550  // Initialize CU Bus models
 551  glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
 552  locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
 553  nextGlbMemBus = 0;
 554  nextLocMemBus = 0;
 555  fatal_if(numGlbMemUnits > 1,
 556  "No support for multiple Global Memory Pipelines exists!!!");
 557  vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
 558  for (int j = 0; j < numGlbMemUnits; ++j) {
 559  vrfToGlobalMemPipeBus[j] = WaitClass();
 560  vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
 561  }
 562 
 563  fatal_if(numLocMemUnits > 1,
 564  "No support for multiple Local Memory Pipelines exists!!!");
 565  vrfToLocalMemPipeBus.resize(numLocMemUnits);
 566  for (int j = 0; j < numLocMemUnits; ++j) {
 567  vrfToLocalMemPipeBus[j] = WaitClass();
 568  vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
 569  }
570  vectorRegsReserved.resize(numSIMDs, 0);
571  aluPipe.resize(numSIMDs);
572  wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
573 
574  for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
575  wfWait[i] = WaitClass();
576  wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
577  }
578 
579  for (int i = 0; i < numSIMDs; ++i) {
580  aluPipe[i] = WaitClass();
581  aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
582  }
583 
584  // Setup space for call args
585  for (int j = 0; j < numSIMDs; ++j) {
586  for (int i = 0; i < shader->n_wf; ++i) {
587  wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
588  }
589  }
590 
591  // Initializing pipeline resources
592  readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
593  waveStatusList.resize(numSIMDs);
594 
595  for (int j = 0; j < numSIMDs; ++j) {
596  for (int i = 0; i < shader->n_wf; ++i) {
597  waveStatusList[j].push_back(
598  std::make_pair(wfList[j][i], BLOCKED));
599  }
600  }
601 
602  for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
603  dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
604  }
605 
606  fetchStage.init(this);
 607  scoreboardCheckStage.init(this);
 608  scheduleStage.init(this);
609  execStage.init(this);
610  globalMemoryPipe.init(this);
611  localMemoryPipe.init(this);
612  // initialize state for statistics calculation
613  vectorAluInstAvail.resize(numSIMDs, false);
614  shrMemInstAvail = 0;
615  glbMemInstAvail = 0;
616 }
617 
618 bool
 619 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
 620 {
621  // Ruby has completed the memory op. Schedule the mem_resp_event at the
622  // appropriate cycle to process the timing memory response
623  // This delay represents the pipeline delay
624  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
625  int index = sender_state->port_index;
626  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
627 
628  // Is the packet returned a Kernel End or Barrier
629  if (pkt->req->isKernel() && pkt->req->isRelease()) {
630  Wavefront *w =
631  computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
632 
633  // Check if we are waiting on Kernel End Release
634  if (w->status == Wavefront::S_RETURNING) {
635  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
636  computeUnit->cu_id, w->simdId, w->wfSlotId,
637  w->wfDynId, w->kernId);
638 
 639  computeUnit->shader->dispatcher->notifyWgCompl(w);
 640  w->status = Wavefront::S_STOPPED;
 641  } else {
642  w->outstandingReqs--;
643  }
644 
645  DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
646  computeUnit->cu_id, gpuDynInst->simdId,
647  gpuDynInst->wfSlotId, w->barrierCnt);
648 
649  if (gpuDynInst->useContinuation) {
650  assert(!gpuDynInst->isNoScope());
651  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
652  gpuDynInst);
653  }
654 
655  delete pkt->senderState;
656  delete pkt->req;
657  delete pkt;
658  return true;
659  } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
660  if (gpuDynInst->useContinuation) {
661  assert(!gpuDynInst->isNoScope());
662  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
663  gpuDynInst);
664  }
665 
666  delete pkt->senderState;
667  delete pkt->req;
668  delete pkt;
669  return true;
670  }
671 
672  ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
 673  new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
 674  pkt);
675 
676  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
677  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
678  index, pkt->req->getPaddr());
679 
680  computeUnit->schedule(mem_resp_event,
 681  curTick() + computeUnit->resp_tick_latency);
 682  return true;
683 }
684 
685 void
 686 ComputeUnit::DataPort::recvReqRetry()
 687 {
688  int len = retries.size();
689 
690  assert(len > 0);
691 
692  for (int i = 0; i < len; ++i) {
693  PacketPtr pkt = retries.front().first;
694  GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
695  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
696  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
697  pkt->req->getPaddr());
698 
 699  /** Currently Ruby can return false due to conflicts for the particular
 700  *  cache block or address.  Thus other requests should be allowed to
 701  *  pass and the data port should expect multiple retries. */
 702  if (!sendTimingReq(pkt)) {
703  DPRINTF(GPUMem, "failed again!\n");
704  break;
705  } else {
706  DPRINTF(GPUMem, "successful!\n");
707  retries.pop_front();
708  }
709  }
710 }
711 
712 bool
 713 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
 714 {
715  computeUnit->fetchStage.processFetchReturn(pkt);
716 
717  return true;
718 }
719 
720 void
 721 ComputeUnit::SQCPort::recvReqRetry()
 722 {
723  int len = retries.size();
724 
725  assert(len > 0);
726 
727  for (int i = 0; i < len; ++i) {
728  PacketPtr pkt = retries.front().first;
729  Wavefront *wavefront M5_VAR_USED = retries.front().second;
730  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
731  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
732  pkt->req->getPaddr());
733  if (!sendTimingReq(pkt)) {
734  DPRINTF(GPUFetch, "failed again!\n");
735  break;
736  } else {
737  DPRINTF(GPUFetch, "successful!\n");
738  retries.pop_front();
739  }
740  }
741 }
742 
743 void
 744 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
 745 {
746  // There must be a way around this check to do the globalMemStart...
747  Addr tmp_vaddr = pkt->req->getVaddr();
748 
749  updatePageDivergenceDist(tmp_vaddr);
750 
751  pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
752  pkt->req->getFlags(), pkt->req->masterId(),
753  pkt->req->getPC());
754 
755  // figure out the type of the request to set read/write
756  BaseTLB::Mode TLB_mode;
757  assert(pkt->isRead() || pkt->isWrite());
758 
759  // Check write before read for atomic operations
760  // since atomic operations should use BaseTLB::Write
761  if (pkt->isWrite()){
762  TLB_mode = BaseTLB::Write;
763  } else if (pkt->isRead()) {
764  TLB_mode = BaseTLB::Read;
765  } else {
766  fatal("pkt is not a read nor a write\n");
767  }
768 
769  tlbCycles -= curTick();
770  ++tlbRequests;
771 
772  int tlbPort_index = perLaneTLB ? index : 0;
773 
774  if (shader->timingSim) {
775  if (debugSegFault) {
 776  Process *p = shader->gpuTc->getProcessPtr();
 777  Addr vaddr = pkt->req->getVaddr();
778  unsigned size = pkt->getSize();
779 
780  if ((vaddr + size - 1) % 64 < vaddr % 64) {
781  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
782  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
783  }
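     // The check above flags any access that wraps past a 64-byte boundary:
     // e.g. an 8-byte access at a vaddr with offset 60 in its 64-byte chunk
     // gives (vaddr + 7) % 64 == 3 < vaddr % 64 == 60, so it panics as
     // unaligned.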
784 
785  Addr paddr;
786 
787  if (!p->pTable->translate(vaddr, paddr)) {
788  if (!p->fixupStackFault(vaddr)) {
789  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
790  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
791  vaddr);
792  }
793  }
794  }
795 
796  // This is the SenderState needed upon return
797  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
798 
799  // This is the senderState needed by the TLB hierarchy to function
800  TheISA::GpuTLB::TranslationState *translation_state =
801  new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
802  pkt->senderState);
803 
804  pkt->senderState = translation_state;
805 
806  if (functionalTLB) {
807  tlbPort[tlbPort_index]->sendFunctional(pkt);
808 
809  // update the hitLevel distribution
810  int hit_level = translation_state->hitLevel;
811  assert(hit_level != -1);
812  hitsPerTLBLevel[hit_level]++;
813 
814  // New SenderState for the memory access
815  X86ISA::GpuTLB::TranslationState *sender_state =
 816  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
 817 
818  delete sender_state->tlbEntry;
819  delete sender_state->saved;
820  delete sender_state;
821 
822  assert(pkt->req->hasPaddr());
823  assert(pkt->req->hasSize());
824 
825  uint8_t *tmpData = pkt->getPtr<uint8_t>();
826 
827  // this is necessary because the GPU TLB receives packets instead
 828  // of requests. when the translation is complete, all relevant
829  // fields in the request will be populated, but not in the packet.
830  // here we create the new packet so we can set the size, addr,
831  // and proper flags.
832  PacketPtr oldPkt = pkt;
833  pkt = new Packet(oldPkt->req, oldPkt->cmd);
834  delete oldPkt;
835  pkt->dataStatic(tmpData);
836 
837 
838  // New SenderState for the memory access
839  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
840  index, nullptr);
841 
842  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
843  gpuDynInst->tlbHitLevel[index] = hit_level;
844 
845 
846  // translation is done. Schedule the mem_req_event at the
847  // appropriate cycle to send the timing memory request to ruby
848  ComputeUnit::DataPort::MemReqEvent *mem_req_event =
 849  new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
 850 
851  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
852  "scheduled\n", cu_id, gpuDynInst->simdId,
853  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
854 
855  schedule(mem_req_event, curTick() + req_tick_latency);
856  } else if (tlbPort[tlbPort_index]->isStalled()) {
857  assert(tlbPort[tlbPort_index]->retries.size() > 0);
858 
859  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
860  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
861  tmp_vaddr);
862 
863  tlbPort[tlbPort_index]->retries.push_back(pkt);
864  } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
865  // Stall the data port;
866  // No more packet will be issued till
867  // ruby indicates resources are freed by
868  // a recvReqRetry() call back on this port.
869  tlbPort[tlbPort_index]->stallPort();
870 
871  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
872  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
873  tmp_vaddr);
874 
875  tlbPort[tlbPort_index]->retries.push_back(pkt);
876  } else {
877  DPRINTF(GPUTLB,
878  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
879  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
880  }
881  } else {
882  if (pkt->cmd == MemCmd::MemFenceReq) {
883  gpuDynInst->statusBitVector = VectorMask(0);
884  } else {
885  gpuDynInst->statusBitVector &= (~(1ll << index));
886  }
887 
888  // New SenderState for the memory access
889  delete pkt->senderState;
890 
891  // Because it's atomic operation, only need TLB translation state
892  pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
893  shader->gpuTc);
894 
895  tlbPort[tlbPort_index]->sendFunctional(pkt);
896 
897  // the addr of the packet is not modified, so we need to create a new
898  // packet, or otherwise the memory access will have the old virtual
899  // address sent in the translation packet, instead of the physical
900  // address returned by the translation.
901  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
902  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
903 
904  // Translation is done. It is safe to send the packet to memory.
905  memPort[0]->sendFunctional(new_pkt);
906 
907  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
908  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
909  new_pkt->req->getPaddr());
910 
911  // safe_cast the senderState
912  TheISA::GpuTLB::TranslationState *sender_state =
913  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
914 
915  delete sender_state->tlbEntry;
916  delete new_pkt;
917  delete pkt->senderState;
918  delete pkt->req;
919  delete pkt;
920  }
921 }
922 
923 void
 924 ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
 925 {
926  ComputeUnit::DataPort::MemReqEvent *mem_req_event =
 927  new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
 928 
929 
930  // New SenderState for the memory access
931  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
932  nullptr);
933 
934  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
935  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
936  pkt->req->getPaddr());
937 
938  schedule(mem_req_event, curTick() + req_tick_latency);
939 }
940 
941 void
942 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
943  Request* req)
944 {
945  assert(gpuDynInst->isGlobalSeg());
946 
947  if (!req) {
948  req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
949  }
950  req->setPaddr(0);
951  if (kernelLaunch) {
 952  req->setFlags(Request::KERNEL);
 953  }
954 
955  // for non-kernel MemFence operations, memorder flags are set depending
956  // on which type of request is currently being sent, so this
957  // should be set by the caller (e.g. if an inst has acq-rel
 958  // semantics, it will send one acquire req and one release req)
959  gpuDynInst->setRequestFlags(req, kernelLaunch);
960 
961  // a mem fence must correspond to an acquire/release request
962  assert(req->isAcquire() || req->isRelease());
963 
964  // create packet
965  PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
966 
967  // set packet's sender state
968  pkt->senderState =
969  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
970 
971  // send the packet
972  sendSyncRequest(gpuDynInst, 0, pkt);
973 }
974 
975 const char*
 976 ComputeUnit::DataPort::MemRespEvent::description() const
 977 {
978  return "ComputeUnit memory response event";
979 }
980 
981 void
 982 ComputeUnit::DataPort::MemRespEvent::process()
 983 {
984  DataPort::SenderState *sender_state =
985  safe_cast<DataPort::SenderState*>(pkt->senderState);
986 
987  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
988  ComputeUnit *compute_unit = dataPort->computeUnit;
989 
990  assert(gpuDynInst);
991 
992  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
993  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
994  pkt->req->getPaddr(), dataPort->index);
995 
996  Addr paddr = pkt->req->getPaddr();
997 
998  if (pkt->cmd != MemCmd::MemFenceResp) {
999  int index = gpuDynInst->memStatusVector[paddr].back();
1000 
1001  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1002  pkt->req->getPaddr(), index);
1003 
1004  gpuDynInst->memStatusVector[paddr].pop_back();
1005  gpuDynInst->pAddr = pkt->req->getPaddr();
1006 
1007  if (pkt->isRead() || pkt->isWrite()) {
1008 
1009  if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
1010  gpuDynInst->statusBitVector &= (~(1ULL << index));
1011  } else {
1012  assert(gpuDynInst->statusVector[index] > 0);
1013  gpuDynInst->statusVector[index]--;
1014 
1015  if (!gpuDynInst->statusVector[index])
1016  gpuDynInst->statusBitVector &= (~(1ULL << index));
1017  }
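     // Each returning packet clears (or, for instructions touching more than
     // MAX_REGS_FOR_NON_VEC_MEM_INST registers, decrements) its lane's entry;
     // once statusBitVector reaches zero the whole vector memory instruction
     // has completed and the response is handed to the global memory pipeline
     // below.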
1018 
1019  DPRINTF(GPUMem, "bitvector is now %#x\n",
1020  gpuDynInst->statusBitVector);
1021 
1022  if (gpuDynInst->statusBitVector == VectorMask(0)) {
1023  auto iter = gpuDynInst->memStatusVector.begin();
1024  auto end = gpuDynInst->memStatusVector.end();
1025 
1026  while (iter != end) {
1027  assert(iter->second.empty());
1028  ++iter;
1029  }
1030 
1031  gpuDynInst->memStatusVector.clear();
1032 
1033  if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
1034  gpuDynInst->statusVector.clear();
1035 
1036  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1037 
1038  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1039  compute_unit->cu_id, gpuDynInst->simdId,
1040  gpuDynInst->wfSlotId);
1041 
1042  // after clearing the status vectors,
1043  // see if there is a continuation to perform
1044  // the continuation may generate more work for
1045  // this memory request
1046  if (gpuDynInst->useContinuation) {
1047  assert(!gpuDynInst->isNoScope());
1048  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1049  gpuDynInst);
1050  }
1051  }
1052  }
1053  } else {
1054  gpuDynInst->statusBitVector = VectorMask(0);
1055 
1056  if (gpuDynInst->useContinuation) {
1057  assert(!gpuDynInst->isNoScope());
1058  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1059  gpuDynInst);
1060  }
1061  }
1062 
1063  delete pkt->senderState;
1064  delete pkt->req;
1065  delete pkt;
1066 }
1067 
1068 ComputeUnit*
1069 ComputeUnitParams::create()
1070 {
1071  return new ComputeUnit(this);
1072 }
1073 
1074 bool
 1075 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
 1076 {
1077  Addr line = pkt->req->getPaddr();
1078 
1079  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1080  pkt->req->getVaddr(), line);
1081 
1082  assert(pkt->senderState);
1083  computeUnit->tlbCycles += curTick();
1084 
1085  // pop off the TLB translation state
1086  TheISA::GpuTLB::TranslationState *translation_state =
1087  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1088 
1089  // no PageFaults are permitted for data accesses
1090  if (!translation_state->tlbEntry->valid) {
1091  DTLBPort::SenderState *sender_state =
1092  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1093 
 1094  Wavefront *w M5_VAR_USED =
 1095  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1096  [sender_state->_gpuDynInst->wfSlotId];
1097 
 1098  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1099  pkt->req->getVaddr());
1100  }
1101 
1102  assert(translation_state->tlbEntry->valid);
1103 
1104  // update the hitLevel distribution
1105  int hit_level = translation_state->hitLevel;
1106  computeUnit->hitsPerTLBLevel[hit_level]++;
1107 
1108  delete translation_state->tlbEntry;
1109  assert(!translation_state->ports.size());
1110  pkt->senderState = translation_state->saved;
1111 
1112  // for prefetch pkt
1113  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1114 
1115  delete translation_state;
1116 
1117  // use the original sender state to know how to close this transaction
1118  DTLBPort::SenderState *sender_state =
 1119  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
 1120 
1121  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1122  int mp_index = sender_state->portIndex;
1123  Addr vaddr = pkt->req->getVaddr();
1124  gpuDynInst->memStatusVector[line].push_back(mp_index);
1125  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1126 
1127  MemCmd requestCmd;
1128 
1129  if (pkt->cmd == MemCmd::ReadResp) {
1130  requestCmd = MemCmd::ReadReq;
1131  } else if (pkt->cmd == MemCmd::WriteResp) {
1132  requestCmd = MemCmd::WriteReq;
1133  } else if (pkt->cmd == MemCmd::SwapResp) {
1134  requestCmd = MemCmd::SwapReq;
1135  } else {
1136  panic("unsupported response to request conversion %s\n",
1137  pkt->cmd.toString());
1138  }
1139 
1140  if (computeUnit->prefetchDepth) {
1141  int simdId = gpuDynInst->simdId;
1142  int wfSlotId = gpuDynInst->wfSlotId;
1143  Addr last = 0;
1144 
1145  switch(computeUnit->prefetchType) {
1146  case Enums::PF_CU:
1147  last = computeUnit->lastVaddrCU[mp_index];
1148  break;
1149  case Enums::PF_PHASE:
1150  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1151  break;
1152  case Enums::PF_WF:
1153  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1154  default:
1155  break;
1156  }
1157 
1158  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1159  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1160 
1161  int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
 1162  roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
 1163  : 0;
1164 
1165  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1166 
1167  computeUnit->lastVaddrCU[mp_index] = vaddr;
1168  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1169  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1170 
1171  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1172  computeUnit->prefetchStride: stride;
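     // Illustrative case: with prefetchDepth == 2 and a computed or configured
     // stride of one page, the loop below issues functional translations for
     // vaddr + 1 page and vaddr + 2 pages, warming the TLB without adding any
     // timing traffic (prefetches are zero-latency here).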
1173 
1174  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1175  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1176 
1177  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1178 
1179  // Prefetch Next few pages atomically
1180  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1181  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1182  vaddr+stride*pf*TheISA::PageBytes);
1183 
1184  if (!stride)
1185  break;
1186 
1187  Request *prefetch_req = new Request(0, vaddr + stride * pf *
1188  TheISA::PageBytes,
1189  sizeof(uint8_t), 0,
1190  computeUnit->masterId(),
1191  0, 0, 0);
1192 
1193  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1194  uint8_t foo = 0;
1195  prefetch_pkt->dataStatic(&foo);
1196 
1197  // Because it's atomic operation, only need TLB translation state
1198  prefetch_pkt->senderState =
1199  new TheISA::GpuTLB::TranslationState(TLB_mode,
1200  computeUnit->shader->gpuTc,
1201  true);
1202 
1203  // Currently prefetches are zero-latency, hence the sendFunctional
1204  sendFunctional(prefetch_pkt);
1205 
1206  /* safe_cast the senderState */
1207  TheISA::GpuTLB::TranslationState *tlb_state =
1208  safe_cast<TheISA::GpuTLB::TranslationState*>(
1209  prefetch_pkt->senderState);
1210 
1211 
1212  delete tlb_state->tlbEntry;
1213  delete tlb_state;
1214  delete prefetch_pkt->req;
1215  delete prefetch_pkt;
1216  }
1217  }
1218 
1219  // First we must convert the response cmd back to a request cmd so that
1220  // the request can be sent through the cu's master port
1221  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1222  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1223  delete pkt->senderState;
1224  delete pkt;
1225 
1226  // New SenderState for the memory access
1227  new_pkt->senderState =
1228  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1229  nullptr);
1230 
1231  // translation is done. Schedule the mem_req_event at the appropriate
1232  // cycle to send the timing memory request to ruby
1233  ComputeUnit::DataPort::MemReqEvent *mem_req_event =
1234  new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
1235  new_pkt);
1236 
1237  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1238  computeUnit->cu_id, gpuDynInst->simdId,
1239  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1240 
1241  computeUnit->schedule(mem_req_event, curTick() +
1242  computeUnit->req_tick_latency);
1243 
1244  return true;
1245 }
1246 
1247 const char*
 1248 ComputeUnit::DataPort::MemReqEvent::description() const
 1249 {
1250  return "ComputeUnit memory request event";
1251 }
1252 
1253 void
 1254 ComputeUnit::DataPort::MemReqEvent::process()
 1255 {
1256  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1257  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1258  ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
1259 
1260  if (!(dataPort->sendTimingReq(pkt))) {
1261  dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
1262 
1263  DPRINTF(GPUPort,
1264  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1265  compute_unit->cu_id, gpuDynInst->simdId,
1266  gpuDynInst->wfSlotId, dataPort->index,
1267  pkt->req->getPaddr());
1268  } else {
1269  DPRINTF(GPUPort,
1270  "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1271  compute_unit->cu_id, gpuDynInst->simdId,
1272  gpuDynInst->wfSlotId, dataPort->index,
1273  pkt->req->getPaddr());
1274  }
1275 }
1276 
1277 /*
1278  * The initial translation request could have been rejected,
 1279  * if <retries> queue is not empty. Retry sending the translation
1280  * request. sendRetry() is called from the peer port whenever
1281  * a translation completes.
1282  */
1283 void
 1284 ComputeUnit::DTLBPort::recvReqRetry()
 1285 {
1286  int len = retries.size();
1287 
1288  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1289  computeUnit->cu_id, len);
1290 
1291  assert(len > 0);
1292  assert(isStalled());
1293  // recvReqRetry is an indication that the resource on which this
1294  // port was stalling on is freed. So, remove the stall first
1295  unstallPort();
1296 
1297  for (int i = 0; i < len; ++i) {
1298  PacketPtr pkt = retries.front();
1299  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
 1300  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1301 
1302  if (!sendTimingReq(pkt)) {
1303  // Stall port
1304  stallPort();
1305  DPRINTF(GPUTLB, ": failed again\n");
1306  break;
1307  } else {
1308  DPRINTF(GPUTLB, ": successful\n");
1309  retries.pop_front();
1310  }
1311  }
1312 }
1313 
1314 bool
 1315 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
 1316 {
1317  Addr line M5_VAR_USED = pkt->req->getPaddr();
1318  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1319  computeUnit->cu_id, pkt->req->getVaddr(), line);
1320 
1321  assert(pkt->senderState);
1322 
1323  // pop off the TLB translation state
1324  TheISA::GpuTLB::TranslationState *translation_state =
1325  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1326 
1327  bool success = translation_state->tlbEntry->valid;
1328  delete translation_state->tlbEntry;
1329  assert(!translation_state->ports.size());
1330  pkt->senderState = translation_state->saved;
1331  delete translation_state;
1332 
1333  // use the original sender state to know how to close this transaction
1334  ITLBPort::SenderState *sender_state =
 1335  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
 1336 
1337  // get the wavefront associated with this translation request
1338  Wavefront *wavefront = sender_state->wavefront;
1339  delete pkt->senderState;
1340 
1341  if (success) {
1342  // pkt is reused in fetch(), don't delete it here. However, we must
1343  // reset the command to be a request so that it can be sent through
1344  // the cu's master port
1345  assert(pkt->cmd == MemCmd::ReadResp);
1346  pkt->cmd = MemCmd::ReadReq;
1347 
1348  computeUnit->fetchStage.fetch(pkt, wavefront);
1349  } else {
1350  if (wavefront->dropFetch) {
1351  assert(wavefront->instructionBuffer.empty());
1352  wavefront->dropFetch = false;
1353  }
1354 
1355  wavefront->pendingFetch = 0;
1356  }
1357 
1358  return true;
1359 }
1360 
1361 /*
1362  * The initial translation request could have been rejected, if
1363  * <retries> queue is not empty. Retry sending the translation
1364  * request. sendRetry() is called from the peer port whenever
1365  * a translation completes.
1366  */
1367 void
 1368 ComputeUnit::ITLBPort::recvReqRetry()
 1369 {
1370 
1371  int len = retries.size();
 1372  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1373 
1374  assert(len > 0);
1375  assert(isStalled());
1376 
1377  // recvReqRetry is an indication that the resource on which this
1378  // port was stalling on is freed. So, remove the stall first
1379  unstallPort();
1380 
1381  for (int i = 0; i < len; ++i) {
1382  PacketPtr pkt = retries.front();
1383  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
 1384  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1385 
1386  if (!sendTimingReq(pkt)) {
1387  stallPort(); // Stall port
1388  DPRINTF(GPUTLB, ": failed again\n");
1389  break;
1390  } else {
1391  DPRINTF(GPUTLB, ": successful\n");
1392  retries.pop_front();
1393  }
1394  }
1395 }
1396 
1397 void
 1398 ComputeUnit::regStats()
 1399 {
1401 
1402  vALUInsts
1403  .name(name() + ".valu_insts")
1404  .desc("Number of vector ALU insts issued.")
1405  ;
 1406  vALUInstsPerWF
 1407  .name(name() + ".valu_insts_per_wf")
1408  .desc("The avg. number of vector ALU insts issued per-wavefront.")
1409  ;
1410  sALUInsts
1411  .name(name() + ".salu_insts")
1412  .desc("Number of scalar ALU insts issued.")
1413  ;
 1414  sALUInstsPerWF
 1415  .name(name() + ".salu_insts_per_wf")
1416  .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1417  ;
 1418  instCyclesVALU
 1419  .name(name() + ".inst_cycles_valu")
1420  .desc("Number of cycles needed to execute VALU insts.")
1421  ;
 1422  instCyclesSALU
 1423  .name(name() + ".inst_cycles_salu")
1424  .desc("Number of cycles needed to execute SALU insts.")
1425  ;
 1426  threadCyclesVALU
 1427  .name(name() + ".thread_cycles_valu")
1428  .desc("Number of thread cycles used to execute vector ALU ops. "
1429  "Similar to instCyclesVALU but multiplied by the number of "
1430  "active threads.")
1431  ;
 1432  vALUUtilization
 1433  .name(name() + ".valu_utilization")
1434  .desc("Percentage of active vector ALU threads in a wave.")
1435  ;
 1436  ldsNoFlatInsts
 1437  .name(name() + ".lds_no_flat_insts")
1438  .desc("Number of LDS insts issued, not including FLAT "
1439  "accesses that resolve to LDS.")
1440  ;
 1441  ldsNoFlatInstsPerWF
 1442  .name(name() + ".lds_no_flat_insts_per_wf")
1443  .desc("The avg. number of LDS insts (not including FLAT "
1444  "accesses that resolve to LDS) per-wavefront.")
1445  ;
 1446  flatVMemInsts
 1447  .name(name() + ".flat_vmem_insts")
1448  .desc("The number of FLAT insts that resolve to vmem issued.")
1449  ;
 1450  flatVMemInstsPerWF
 1451  .name(name() + ".flat_vmem_insts_per_wf")
1452  .desc("The average number of FLAT insts that resolve to vmem "
1453  "issued per-wavefront.")
1454  ;
1455  flatLDSInsts
1456  .name(name() + ".flat_lds_insts")
1457  .desc("The number of FLAT insts that resolve to LDS issued.")
1458  ;
 1459  flatLDSInstsPerWF
 1460  .name(name() + ".flat_lds_insts_per_wf")
1461  .desc("The average number of FLAT insts that resolve to LDS "
1462  "issued per-wavefront.")
1463  ;
 1464  vectorMemWrites
 1465  .name(name() + ".vector_mem_writes")
1466  .desc("Number of vector mem write insts (excluding FLAT insts).")
1467  ;
 1468  vectorMemWritesPerWF
 1469  .name(name() + ".vector_mem_writes_per_wf")
1470  .desc("The average number of vector mem write insts "
1471  "(excluding FLAT insts) per-wavefront.")
1472  ;
 1473  vectorMemReads
 1474  .name(name() + ".vector_mem_reads")
1475  .desc("Number of vector mem read insts (excluding FLAT insts).")
1476  ;
 1477  vectorMemReadsPerWF
 1478  .name(name() + ".vector_mem_reads_per_wf")
1479  .desc("The avg. number of vector mem read insts (excluding "
1480  "FLAT insts) per-wavefront.")
1481  ;
 1482  scalarMemWrites
 1483  .name(name() + ".scalar_mem_writes")
1484  .desc("Number of scalar mem write insts.")
1485  ;
 1486  scalarMemWritesPerWF
 1487  .name(name() + ".scalar_mem_writes_per_wf")
1488  .desc("The average number of scalar mem write insts per-wavefront.")
1489  ;
 1490  scalarMemReads
 1491  .name(name() + ".scalar_mem_reads")
1492  .desc("Number of scalar mem read insts.")
1493  ;
 1494  scalarMemReadsPerWF
 1495  .name(name() + ".scalar_mem_reads_per_wf")
1496  .desc("The average number of scalar mem read insts per-wavefront.")
1497  ;
 1498 
 1499  vALUInstsPerWF = vALUInsts / completedWfs;
 1500  sALUInstsPerWF = sALUInsts / completedWfs;
 1501  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
 1502  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
 1503  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
 1504  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
 1505  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
 1506  vectorMemReadsPerWF = vectorMemReads / completedWfs;
 1507  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
 1508  scalarMemReadsPerWF = scalarMemReads / completedWfs;
 1509 
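 // Note: the constant 64 in valu_utilization assumes a 64-lane wavefront;
 // a different wfSize() would need that value here instead.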
1510  tlbCycles
1511  .name(name() + ".tlb_cycles")
1512  .desc("total number of cycles for all uncoalesced requests")
1513  ;
1514 
1515  tlbRequests
1516  .name(name() + ".tlb_requests")
1517  .desc("number of uncoalesced requests")
1518  ;
1519 
1520  tlbLatency
1521  .name(name() + ".avg_translation_latency")
1522  .desc("Avg. translation latency for data translations")
1523  ;
1524 
 1525  tlbLatency = tlbCycles / tlbRequests;
 1526 
 1527  hitsPerTLBLevel
 1528  .init(4)
1529  .name(name() + ".TLB_hits_distribution")
1530  .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
1531  ;
1532 
1533  // fixed number of TLB levels
1534  for (int i = 0; i < 4; ++i) {
1535  if (!i)
1536  hitsPerTLBLevel.subname(i,"page_table");
1537  else
1538  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
1539  }
1540 
1541  execRateDist
1542  .init(0, 10, 2)
1543  .name(name() + ".inst_exec_rate")
1544  .desc("Instruction Execution Rate: Number of executed vector "
1545  "instructions per cycle")
1546  ;
1547 
 1548  ldsBankConflictDist
 1549  .init(0, wfSize(), 2)
1550  .name(name() + ".lds_bank_conflicts")
1551  .desc("Number of bank conflicts per LDS memory packet")
1552  ;
1553 
 1554  ldsBankAccesses
 1555  .name(name() + ".lds_bank_access_cnt")
1556  .desc("Total number of LDS bank accesses")
1557  ;
1558 
 1559  pageDivergenceDist
 1560  // A wavefront can touch up to N pages per memory instruction where
1561  // N is equal to the wavefront size
1562  // The number of pages per bin can be configured (here it's 4).
1563  .init(1, wfSize(), 4)
1564  .name(name() + ".page_divergence_dist")
1565  .desc("pages touched per wf (over all mem. instr.)")
1566  ;
1567 
 1568  controlFlowDivergenceDist
 1569  .init(1, wfSize(), 4)
1570  .name(name() + ".warp_execution_dist")
 1571  .desc("number of lanes active per instruction (over all instructions)")
1572  ;
1573 
 1574  activeLanesPerGMemInstrDist
 1575  .init(1, wfSize(), 4)
1576  .name(name() + ".gmem_lanes_execution_dist")
1577  .desc("number of active lanes per global memory instruction")
1578  ;
1579 
 1580  activeLanesPerLMemInstrDist
 1581  .init(1, wfSize(), 4)
1582  .name(name() + ".lmem_lanes_execution_dist")
1583  .desc("number of active lanes per local memory instruction")
1584  ;
1585 
 1586  numInstrExecuted
 1587  .name(name() + ".num_instr_executed")
1588  .desc("number of instructions executed")
1589  ;
1590 
 1591  numVecOpsExecuted
 1592  .name(name() + ".num_vec_ops_executed")
1593  .desc("number of vec ops executed (e.g. WF size/inst)")
1594  ;
1595 
1596  totalCycles
1597  .name(name() + ".num_total_cycles")
1598  .desc("number of cycles the CU ran for")
1599  ;
1600 
1601  ipc
1602  .name(name() + ".ipc")
1603  .desc("Instructions per cycle (this CU only)")
1604  ;
1605 
1606  vpc
1607  .name(name() + ".vpc")
1608  .desc("Vector Operations per cycle (this CU only)")
1609  ;
1610 
 1611  numALUInstsExecuted
 1612  .name(name() + ".num_alu_insts_executed")
1613  .desc("Number of dynamic non-GM memory insts executed")
1614  ;
1615 
 1616  wgBlockedDueLdsAllocation
 1617  .name(name() + ".wg_blocked_due_lds_alloc")
1618  .desc("Workgroup blocked due to LDS capacity")
1619  ;
1620 
 1621  ipc = numInstrExecuted / totalCycles;
 1622  vpc = numVecOpsExecuted / totalCycles;
 1623 
 1624  numTimesWgBlockedDueVgprAlloc
 1625  .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1626  .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1627  ;
1628 
 1629  dynamicGMemInstrCnt
 1630  .name(name() + ".global_mem_instr_cnt")
1631  .desc("dynamic global memory instructions count")
1632  ;
1633 
 1634  dynamicLMemInstrCnt
 1635  .name(name() + ".local_mem_instr_cnt")
 1636  .desc("dynamic local memory instruction count")
1637  ;
1638 
 1639  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
 1640  dynamicLMemInstrCnt;
 1641 
1642  completedWfs
1643  .name(name() + ".num_completed_wfs")
1644  .desc("number of completed wavefronts")
1645  ;
1646 
1647  numCASOps
1648  .name(name() + ".num_CAS_ops")
1649  .desc("number of compare and swap operations")
1650  ;
1651 
 1652  numFailedCASOps
 1653  .name(name() + ".num_failed_CAS_ops")
1654  .desc("number of compare and swap operations that failed")
1655  ;
1656 
1657  // register stats of pipeline stages
1658  fetchStage.regStats();
 1659  scoreboardCheckStage.regStats();
 1660  scheduleStage.regStats();
 1661  execStage.regStats();
1662 
1663  // register stats of memory pipeline
 1664  globalMemoryPipe.regStats();
 1665  localMemoryPipe.regStats();
 1666 }
1667 
1668 void
 1669 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
 1670 {
1671  if (gpuDynInst->isScalar()) {
1672  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1673  sALUInsts++;
1674  instCyclesSALU++;
1675  } else if (gpuDynInst->isLoad()) {
1676  scalarMemReads++;
1677  } else if (gpuDynInst->isStore()) {
1678  scalarMemWrites++;
1679  }
1680  } else {
1681  if (gpuDynInst->isALU()) {
1682  vALUInsts++;
1683  instCyclesVALU++;
1684  threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
1685  } else if (gpuDynInst->isFlat()) {
1686  if (gpuDynInst->isLocalMem()) {
1687  flatLDSInsts++;
1688  } else {
1689  flatVMemInsts++;
1690  }
1691  } else if (gpuDynInst->isLocalMem()) {
1692  ldsNoFlatInsts++;
1693  } else if (gpuDynInst->isLoad()) {
1694  vectorMemReads++;
1695  } else if (gpuDynInst->isStore()) {
1696  vectorMemWrites++;
1697  }
1698  }
1699 }
1700 
1701 void
 1702 ComputeUnit::updatePageDivergenceDist(Addr addr)
 1703 {
1704  Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
1705 
1706  if (!pagesTouched.count(virt_page_addr))
1707  pagesTouched[virt_page_addr] = 1;
1708  else
1709  pagesTouched[virt_page_addr]++;
1710 }
1711 
1712 void
 1713 ComputeUnit::CUExitCallback::process()
 1714 {
1715  if (computeUnit->countPages) {
1716  std::ostream *page_stat_file =
1717  simout.create(computeUnit->name().c_str())->stream();
1718 
1719  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1720  std::endl;
1721 
1722  for (auto iter : computeUnit->pageAccesses) {
1723  *page_stat_file << std::hex << iter.first << ",";
1724  *page_stat_file << std::dec << iter.second.first << ",";
1725  *page_stat_file << std::dec << iter.second.second << std::endl;
1726  }
1727  }
1728  }
1729 
1730 bool
 1731 ComputeUnit::isDone() const
 1732 {
1733  for (int i = 0; i < numSIMDs; ++i) {
1734  if (!isSimdDone(i)) {
1735  return false;
1736  }
1737  }
1738 
1739  bool glbMemBusRdy = true;
1740  for (int j = 0; j < numGlbMemUnits; ++j) {
1741  glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
1742  }
1743  bool locMemBusRdy = true;
1744  for (int j = 0; j < numLocMemUnits; ++j) {
1745  locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
1746  }
1747 
 1748  if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
 1749  !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
 1750  !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() ||
 1751  !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
 1752  !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
1753  return false;
1754  }
1755 
1756  return true;
1757 }
1758 
1759 int32_t
1760 ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
1761 {
1762  return lds.getRefCounter(dispatchId, wgId);
1763 }
1764 
1765 bool
1766 ComputeUnit::isSimdDone(uint32_t simdId) const
1767 {
1768  assert(simdId < numSIMDs);
1769 
1770  for (int i=0; i < numGlbMemUnits; ++i) {
1771  if (!vrfToGlobalMemPipeBus[i].rdy())
1772  return false;
1773  }
1774  for (int i=0; i < numLocMemUnits; ++i) {
1775  if (!vrfToLocalMemPipeBus[i].rdy())
1776  return false;
1777  }
1778  if (!aluPipe[simdId].rdy()) {
1779  return false;
1780  }
1781 
1782  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1783  if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
1784  return false;
1785  }
1786  }
1787 
1788  return true;
1789 }
1790 
1796 bool
 1797 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
 1798 {
1799  // this is just a request to carry the GPUDynInstPtr
1800  // back and forth
1801  Request *newRequest = new Request();
1802  newRequest->setPaddr(0x0);
1803 
1804  // ReadReq is not evaluted by the LDS but the Packet ctor requires this
1805  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1806 
1807  // This is the SenderState needed upon return
1808  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1809 
1810  return ldsPort->sendTimingReq(newPacket);
1811 }
1812 
1816 bool
 1817 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
 1818 {
1819  const ComputeUnit::LDSPort::SenderState *senderState =
1820  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
1821 
1822  fatal_if(!senderState, "did not get the right sort of sender state");
1823 
1824  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
1825 
1826  delete packet->senderState;
1827  delete packet->req;
1828  delete packet;
1829 
1830  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1831  return true;
1832 }
1833 
1839 bool
 1840 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
 1841 {
1842  ComputeUnit::LDSPort::SenderState *sender_state =
1843  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
1844  fatal_if(!sender_state, "packet without a valid sender state");
1845 
1846  GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
1847 
1848  if (isStalled()) {
1849  fatal_if(retries.empty(), "must have retries waiting to be stalled");
1850 
1851  retries.push(pkt);
1852 
1853  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1854  computeUnit->cu_id, gpuDynInst->simdId,
1855  gpuDynInst->wfSlotId);
1856  return false;
1857  } else if (!MasterPort::sendTimingReq(pkt)) {
1858  // need to stall the LDS port until a recvReqRetry() is received
1859  // this indicates that there is more space
1860  stallPort();
1861  retries.push(pkt);
1862 
1863  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1864  computeUnit->cu_id, gpuDynInst->simdId,
1865  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1866  return false;
1867  } else {
1868  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1869  computeUnit->cu_id, gpuDynInst->simdId,
1870  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1871  return true;
1872  }
1873 }
1874 
1881 void
 1882 ComputeUnit::LDSPort::recvReqRetry()
 1883 {
1884  auto queueSize = retries.size();
1885 
1886  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1887  computeUnit->cu_id, queueSize);
1888 
1889  fatal_if(queueSize < 1,
1890  "why was there a recvReqRetry() with no pending reqs?");
1891  fatal_if(!isStalled(),
1892  "recvReqRetry() happened when the port was not stalled");
1893 
1894  unstallPort();
1895 
1896  while (!retries.empty()) {
1897  PacketPtr packet = retries.front();
1898 
1899  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1900 
1901  if (!MasterPort::sendTimingReq(packet)) {
1902  // Stall port
1903  stallPort();
1904  DPRINTF(GPUPort, ": LDS send failed again\n");
1905  break;
1906  } else {
 1907  DPRINTF(GPUPort, ": LDS send successful\n");
1908  retries.pop();
1909  }
1910  }
1911 }