gem5: wavefront.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr), _gpuISA()
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

void
Wavefront::start(uint64_t _wf_dyn_id, uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() || ii->isFlat())
        return true;

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() ||
        ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer has a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->isReturn() || ii->isBranch()) {
            return true;
        }
    }

    return false;
}

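// These predicates look only at the oldest (front) instruction in
// instructionBuffer and report what kind of work it is; ready() further
// below performs the corresponding per-type resource checks before that
// instruction can actually be issued.
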
// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

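// Illustrative mapping (hypothetical numbers, not taken from any real
// configuration): with startVgprIndex = 32 and maxSpVgprs = 10, the
// single-precision HSAIL register $s3 maps to physical VGPR 32 + 3 = 35,
// while the double-precision register $d2 (mode == 1, size > 4) maps to
// 32 + 10 + 2 * 2 = 46; both results are then taken modulo
// computeUnit->vrf[simdId]->numRegs().
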
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId, barrierCnt,
                    computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j = 0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j = 0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

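// A return value of 1 means the oldest instruction in instructionBuffer can
// be issued this cycle for the requested pipeline type: the wave is running,
// it is not stalled at a barrier, the execution slot and request FIFOs have
// room, and the VRF reports both port availability and resolved operand
// dependences. A caller in the scheduling logic might poll it along these
// lines (hypothetical sketch, not the actual scheduler code):
//
//     if (wf->ready(I_ALU)) {
//         // pick wf as an issue candidate for the ALU pipeline
//     }
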
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

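// Note on resource accounting: updateResources() above charges the ALU and
// memory buses and the wave slot with preset() for the instruction that is
// about to issue, while exec() below applies the matching set() calls when
// the instruction actually executes. Both walk the same instruction
// classification, so the assumed latencies stay consistent: 4 shader ticks
// for loads, 8 for stores, atomics and fences, and issuePeriod for the
// per-SIMD issue slot.
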
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    } else {
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

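// The reconvergence stack implements SIMT control-flow divergence. When a
// branch splits the wave, the branch implementation pushes an entry carrying
// the target pc, the reconvergence pc (rpc, the immediate post-dominator)
// and the execution mask of the lanes taking that path; exec() pops the
// entry again once pc reaches rpc, which re-exposes the previous entry's
// mask. For example (hypothetical values), a divergent branch at pc 100 that
// reconverges at pc 120 might push {pc: 104, rpc: 120, execMask: taken
// lanes}; the wave then runs the taken path with that mask until pc == 120,
// at which point popFromReconvergenceStack() restores the pre-branch mask.
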
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}


void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}

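// The byte stream produced above must be consumed in exactly the same order
// by setContext() below: per-lane barrier counters, the scalar wave state
// (wfId, maxBarCnt, oldBarrierCnt, barrierCnt, cu_id, wgId, barrierId,
// initMask, privBase, spillBase), a fixed wfSize()-entry image of the
// reconvergence stack padded with sentinel entries, then the SP VGPRs, DP
// VGPRs, condition registers and, if allocated, the LDS chunk. Note that
// getContext() drains reconvergenceStack as it serializes it; setContext()
// rebuilds the stack from the saved entries.
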
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /* restoring LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}

void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}
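
// Example (hypothetical sizes): with workGroupSz = {256, 1, 1},
// gridSz = {1000, 1, 1} and ndr->wgId = {3, 0, 0}, the last work-group in
// dimension 0 only covers 1000 - 3 * 256 = 232 work-items, so actualWgSz
// becomes {232, 1, 1} and actualWgSzTotal becomes 232.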