gem5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wavefront.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its contributors
18  * may be used to endorse or promote products derived from this software
19  * without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Author: Lisa Hsu
34  */
35 
36 #ifndef __WAVEFRONT_HH__
37 #define __WAVEFRONT_HH__
38 
39 #include <cassert>
40 #include <deque>
41 #include <memory>
42 #include <stack>
43 #include <vector>
44 
45 #include "arch/gpu_isa.hh"
46 #include "base/misc.hh"
47 #include "base/types.hh"
48 #include "config/the_gpu_isa.hh"
50 #include "gpu-compute/lds_state.hh"
51 #include "gpu-compute/misc.hh"
52 #include "gpu-compute/ndrange.hh"
53 #include "params/Wavefront.hh"
54 #include "sim/sim_object.hh"
55 
56 static const int MAX_NUM_INSTS_PER_WF = 12;
57 
66  uint32_t pc;
72  uint32_t rpc;
77 };
78 
79 /*
80  * Arguments for the hsail opcode call are user defined and variable length.
81  * The hardware/finalizer can support arguments in hardware or use memory to
82  * pass arguments. For now, let's assume that an unlimited number of arguments
83  * are supported in hardware (the compiler inlines functions whenever it can
84  * anyways, so unless someone is interested in the implications of linking/
85  * library functions, I think this is a reasonable assumption given the typical
86  * size of an OpenCL kernel).
87  *
88  * Note that call args are different than kernel arguments:
89  * * All work-items in a kernel refer to the same set of kernel arguments
90  * * Each work-item has its own set of call args. So a call argument at
91  * address 0x4 is different for work-item 0 and work-item 1.
92  *
93  * Ok, the table below shows an example of how we organize the call arguments in
94  * the CallArgMem class.
95  *
96  * int foo(int arg1, double arg2)
97  * ___________________________________________________
98  * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
99  * |---------------------------------------------------|
100  * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
101  * |---------------------------------------------------|
102  * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
103  * ___________________________________________________
104  */
106 {
107  public:
108  // pointer to buffer for storing function arguments
109  uint8_t *mem;
110  int wfSize;
111  // size of function args
113 
114  template<typename CType>
115  int
116  getLaneOffset(int lane, int addr)
117  {
118  return addr * wfSize + sizeof(CType) * lane;
119  }
120 
121  CallArgMem(int func_args_size_per_item, int wf_size)
122  : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
123  {
124  mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
125  }
126 
128  {
129  free(mem);
130  }
131 
132  template<typename CType>
133  uint8_t*
134  getLaneAddr(int lane, int addr)
135  {
136  return mem + getLaneOffset<CType>(lane, addr);
137  }
138 
139  template<typename CType>
140  void
141  setLaneAddr(int lane, int addr, CType val)
142  {
143  *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
144  }
145 };
146 
147 class Wavefront : public SimObject
148 {
149  public:
152 
153  // Base pointer for array of instruction pointers
154  uint64_t basePtr;
155 
156  uint32_t oldBarrierCnt;
157  uint32_t barrierCnt;
158  uint32_t barrierId;
159  uint32_t barrierSlots;
161  // HW slot id where the WF is mapped to inside a SIMD unit
162  int wfSlotId;
163  int kernId;
164  // SIMD unit where the WF has been scheduled
165  int simdId;
166  // pointer to parent CU
168 
170 
172  bool dropFetch;
173 
174  // Condition Register State (for HSAIL simulations only)
176  // number of single precision VGPRs required by WF
177  uint32_t maxSpVgprs;
178  // number of double precision VGPRs required by WF
179  uint32_t maxDpVgprs;
180  // map virtual to physical vector register
181  uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
182  void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
185  bool isOldestInstGMem();
186  bool isOldestInstLMem();
187  bool isOldestInstPrivMem();
188  bool isOldestInstFlatMem();
189  bool isOldestInstALU();
190  bool isOldestInstBarrier();
191  // used for passing spill address to DDInstGPU
195  /* kernel launch parameters */
196  uint32_t workGroupId[3];
197  uint32_t workGroupSz[3];
198  uint32_t gridSz[3];
199  uint32_t wgId;
200  uint32_t wgSz;
201  /* the actual WG size can differ from the maximum size */
202  uint32_t actualWgSz[3];
203  uint32_t actualWgSzTotal;
204  void computeActualWgSz(NDRange *ndr);
205  // wavefront id within a workgroup
206  uint32_t wfId;
207  uint32_t maxDynWaveId;
208  uint32_t dispatchId;
209  // outstanding global+local memory requests
210  uint32_t outstandingReqs;
211  // memory requests between scoreboard
212  // and execute stage not yet executed
213  uint32_t memReqsInPipe;
214  // outstanding global memory write requests
216  // outstanding local memory write requests
218  // outstanding global memory read requests
220  // outstanding local memory read requests
222  uint32_t rdLmReqsInPipe;
223  uint32_t rdGmReqsInPipe;
224  uint32_t wrLmReqsInPipe;
225  uint32_t wrGmReqsInPipe;
226 
228  uint64_t lastTrace;
229  // number of vector registers reserved by WF
231  // Index into the Vector Register File's namespace where the WF's registers
232  // will live while the WF is executed
233  uint32_t startVgprIndex;
234 
235  // Old value of destination gpr (for trace)
237  // Id of destination gpr (for trace)
238  uint32_t oldVgprId;
239  // Tick count of last old_vgpr copy
240  uint64_t oldVgprTcnt;
241 
242  // Old value of destination gpr (for trace)
244  // Id of destination gpr (for trace)
245  uint32_t oldDgprId;
246  // Tick count of last old_dgpr copy
247  uint64_t oldDgprTcnt;
248 
249  // Execution mask at wavefront start
251 
252  // number of barriers this WF has joined
255  // Flag to stall a wave on barrier
257 
258  // a pointer to the fraction of the LDS allocated
259  // to this workgroup (thus this wavefront)
261 
262  // A pointer to the spill area
264  // The size of the spill area
266  // The vector width of the spill area
267  uint32_t spillWidth;
268 
269  // A pointer to the private memory area
271  // The size of the private memory area
272  uint32_t privSizePerItem;
273 
274  // A pointer to the read-only memory area
276  // size of the read-only memory area
277  uint32_t roSize;
278 
279  // pointer to buffer for storing kernel arguments
280  uint8_t *kernelArgs;
281  // unique WF id over all WFs executed across all CUs
282  uint64_t wfDynId;
283 
284  // number of times instruction issue for this wavefront is blocked
285  // due to VRF port availability
287  // number of times an instruction of a WF is blocked from being issued
288  // due to WAR and WAW dependencies
290  // number of times an instruction of a WF is blocked from being issued
291  // due to RAW dependencies
293  // distribution of executed instructions based on their register
294  // operands; this is used to highlight the load on the VRF
297 
298  // Functions to operate on call argument memory
299  // argument memory for hsail call instruction
301  void
302  initCallArgMem(int func_args_size_per_item, int wf_size)
303  {
304  callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
305  }
306 
307  template<typename CType>
308  CType
309  readCallArgMem(int lane, int addr)
310  {
311  return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
312  }
313 
314  template<typename CType>
315  void
316  writeCallArgMem(int lane, int addr, CType val)
317  {
318  callArgMem->setLaneAddr<CType>(lane, addr, val);
319  }
320 
321  typedef WavefrontParams Params;
322  Wavefront(const Params *p);
323  ~Wavefront();
324  virtual void init();
325 
326  void
328  {
329  computeUnit = cu;
330  }
331 
332  void start(uint64_t _wfDynId, uint64_t _base_ptr);
333  void exec();
334  void updateResources();
335  int ready(itype_e type);
337  void regStats();
339 
340  bool waitingAtBarrier(int lane);
341 
342  void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
343  const VectorMask& exec_mask);
344 
346 
347  uint32_t pc() const;
348 
349  uint32_t rpc() const;
350 
351  VectorMask execMask() const;
352 
353  bool execMask(int lane) const;
354 
355  void pc(uint32_t new_pc);
356 
357  void discardFetch();
358 
363  uint32_t getStaticContextSize() const;
364 
369  void getContext(const void *out);
370 
375  void setContext(const void *in);
376 
377  TheGpuISA::GPUISA&
379  {
380  return _gpuISA;
381  }
382 
383  private:
384  TheGpuISA::GPUISA _gpuISA;
393 };
394 
395 #endif // __WAVEFRONT_HH__
Addr roBase
Definition: wavefront.hh:275
std::vector< uint32_t > oldVgpr
Definition: wavefront.hh:236
uint32_t workGroupSz[3]
Definition: wavefront.hh:197
void discardFetch()
Definition: wavefront.cc:809
Addr spillBase
Definition: wavefront.hh:263
VectorMask getPred()
Definition: wavefront.hh:338
bool isOldestInstGMem()
Definition: wavefront.cc:212
uint32_t oldDgprId
Definition: wavefront.hh:245
Stats::Scalar numTimesBlockedDueRAWDependencies
Definition: wavefront.hh:292
CallArgMem(int func_args_size_per_item, int wf_size)
Definition: wavefront.hh:121
void setContext(const void *in)
Sets the hardware context from a stream of bytes. This method is designed for HSAIL execution...
Definition: wavefront.cc:924
uint32_t barrierCnt
Definition: wavefront.hh:157
Stats::Scalar numTimesBlockedDueVrfPortAvail
Definition: wavefront.hh:286
std::deque< std::unique_ptr< ReconvergenceStackEntry > > reconvergenceStack
Stack containing Control Flow Graph nodes (i.e., kernel instructions) to be visited by the wavefront...
Definition: wavefront.hh:392
uint8_t * mem
Definition: wavefront.hh:109
int maxBarCnt
Definition: wavefront.hh:254
uint32_t gridSz[3]
Definition: wavefront.hh:198
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:45
ip6_addr_t addr
Definition: inet.hh:335
VectorMask initMask
Definition: wavefront.hh:250
uint32_t wgSz
Definition: wavefront.hh:200
uint32_t spillWidth
Definition: wavefront.hh:267
int simdId
Definition: wavefront.hh:165
bool dropFetch
Definition: wavefront.hh:172
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, const VectorMask &exec_mask)
Definition: wavefront.cc:783
uint32_t dispatchId
Definition: wavefront.hh:208
int kernId
Definition: wavefront.hh:163
class ConditionRegisterState * condRegState
Definition: wavefront.hh:175
bool isOldestInstFlatMem()
Definition: wavefront.cc:251
this represents a slice of the overall LDS, intended to be associated with an individual workgroup ...
Definition: lds_state.hh:58
Bitfield< 4, 0 > mode
Definition: miscregs.hh:1385
bool isOldestInstPrivMem()
Definition: wavefront.cc:238
int wfSlotId
Definition: wavefront.hh:162
bool stalledAtBarrier
Definition: wavefront.hh:256
uint32_t maxSpVgprs
Definition: wavefront.hh:177
LdsChunk * ldsChunk
Definition: wavefront.hh:260
Stats::Scalar numTimesBlockedDueWAXDependencies
Definition: wavefront.hh:289
uint32_t oldVgprId
Definition: wavefront.hh:238
This is a simple scalar statistic, like a counter.
Definition: statistics.hh:2475
uint64_t lastTrace
Definition: wavefront.hh:228
Bitfield< 63 > val
Definition: misc.hh:770
uint32_t workGroupId[3]
Definition: wavefront.hh:196
bool instructionBufferHasBranch()
Definition: wavefront.cc:266
uint64_t wfDynId
Definition: wavefront.hh:282
uint32_t barrierSlots
Definition: wavefront.hh:159
CallArgMem * callArgMem
Definition: wavefront.hh:300
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
Stats::Distribution srcRegOpDist
Definition: wavefront.hh:295
std::vector< uint32_t > workItemId[3]
Definition: wavefront.hh:193
uint32_t pc
PC of current instruction.
Definition: wavefront.hh:66
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:169
void initCallArgMem(int func_args_size_per_item, int wf_size)
Definition: wavefront.hh:302
uint32_t actualWgSz[3]
Definition: wavefront.hh:202
uint32_t wfId
Definition: wavefront.hh:206
uint32_t rdLmReqsInPipe
Definition: wavefront.hh:222
void regStats()
Register statistics for this object.
Definition: wavefront.cc:95
Addr privBase
Definition: wavefront.hh:270
std::vector< uint32_t > workItemFlatId
Definition: wavefront.hh:194
Wavefront(const Params *p)
Definition: wavefront.cc:51
void writeCallArgMem(int lane, int addr, CType val)
Definition: wavefront.hh:316
void updateResources()
Definition: wavefront.cc:542
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a particular wavefront. This should be updated ever...
Definition: wavefront.cc:847
A simple distribution stat.
Definition: statistics.hh:2523
CType readCallArgMem(int lane, int addr)
Definition: wavefront.hh:309
std::vector< int > barCnt
Definition: wavefront.hh:253
void setParent(ComputeUnit *cu)
Definition: wavefront.hh:327
ComputeUnit * computeUnit
Definition: wavefront.hh:167
uint32_t wgId
Definition: wavefront.hh:199
Stats::Distribution dstRegOpDist
Definition: wavefront.hh:296
uint32_t rdGmReqsInPipe
Definition: wavefront.hh:223
bool isLmInstruction(GPUDynInstPtr ii)
Definition: wavefront.cc:174
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: wavefront.cc:135
uint32_t outstandingReqsWrLm
Definition: wavefront.hh:217
uint32_t actualWgSzTotal
Definition: wavefront.hh:203
void getContext(const void *out)
Returns the hardware context as a stream of bytes. This method is designed for HSAIL execution...
Definition: wavefront.cc:857
uint32_t outstandingReqsRdGm
Definition: wavefront.hh:219
int ready(itype_e type)
Definition: wavefront.cc:305
int memTraceBusy
Definition: wavefront.hh:227
void computeActualWgSz(NDRange *ndr)
Definition: wavefront.cc:982
WavefrontParams Params
Definition: wavefront.hh:321
void exec()
Definition: wavefront.cc:642
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,16,32,64}_t.
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition: types.hh:142
bool isOldestInstALU()
Definition: wavefront.cc:184
uint32_t outstandingReqsRdLm
Definition: wavefront.hh:221
int getLaneOffset(int lane, int addr)
Definition: wavefront.hh:116
uint64_t basePtr
Definition: wavefront.hh:154
uint32_t outstandingReqs
Definition: wavefront.hh:210
uint32_t privSizePerItem
Definition: wavefront.hh:272
uint32_t pc() const
Definition: wavefront.cc:816
bool isOldestInstBarrier()
Definition: wavefront.cc:199
TheGpuISA::GPUISA & gpuISA()
Definition: wavefront.hh:378
uint32_t outstandingReqsWrGm
Definition: wavefront.hh:215
bool isGmInstruction(GPUDynInstPtr ii)
Definition: wavefront.cc:165
std::vector< Addr > lastAddr
Definition: wavefront.hh:192
TheGpuISA::GPUISA _gpuISA
Definition: wavefront.hh:384
type
Definition: misc.hh:728
int size()
Definition: pagetable.hh:146
uint32_t oldBarrierCnt
Definition: wavefront.hh:156
uint64_t oldDgprTcnt
Definition: wavefront.hh:247
bool pendingFetch
Definition: wavefront.hh:171
uint64_t oldVgprTcnt
Definition: wavefront.hh:240
uint32_t memReqsInPipe
Definition: wavefront.hh:213
bool isOldestInstLMem()
Definition: wavefront.cc:225
int reservedVectorRegs
Definition: wavefront.hh:230
uint32_t startVgprIndex
Definition: wavefront.hh:233
VectorMask execMask() const
Definition: wavefront.cc:828
A reconvergence stack entry conveys the necessary state to implement control flow divergence...
Definition: wavefront.hh:62
void setLaneAddr(int lane, int addr, CType val)
Definition: wavefront.hh:141
uint32_t roSize
Definition: wavefront.hh:277
uint32_t wrGmReqsInPipe
Definition: wavefront.hh:225
uint8_t * getLaneAddr(int lane, int addr)
Definition: wavefront.hh:134
uint32_t spillSizePerItem
Definition: wavefront.hh:265
static const int MAX_NUM_INSTS_PER_WF
Definition: wavefront.hh:56
void start(uint64_t _wfDynId, uint64_t _base_ptr)
Definition: wavefront.cc:157
std::vector< uint64_t > oldDgpr
Definition: wavefront.hh:243
void popFromReconvergenceStack()
Definition: wavefront.cc:791
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
Definition: wavefront.cc:142
uint32_t maxDynWaveId
Definition: wavefront.hh:207
uint32_t wrLmReqsInPipe
Definition: wavefront.hh:224
uint32_t maxDpVgprs
Definition: wavefront.hh:179
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0)
Definition: wavefront.cc:282
uint32_t barrierId
Definition: wavefront.hh:158
Bitfield< 0 > p
Abstract superclass for simulation objects.
Definition: sim_object.hh:94
int funcArgsSizePerItem
Definition: wavefront.hh:112
status_e status
Definition: wavefront.hh:160
uint32_t rpc() const
Definition: wavefront.cc:822
uint32_t rpc
PC of the immediate post-dominator instruction, i.e., the value of pc for the first instruction that ...
Definition: wavefront.hh:72
bool waitingAtBarrier(int lane)
Definition: wavefront.cc:777
VectorMask execMask
Execution mask.
Definition: wavefront.hh:76
uint8_t * kernelArgs
Definition: wavefront.hh:280

Generated on Fri Jun 9 2017 13:03:48 for gem5 by doxygen 1.8.6