gpu_dyn_inst.hh
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};
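
// A minimal usage sketch (illustrative values, not part of the gem5 sources):
// each functor captures its operand(s) at construction and applies the
// operation in place when execute() is invoked on the destination word, e.g.
//
//     uint32_t word = 5;               // hypothetical value in memory
//     AtomicOpAdd<uint32_t> add(3);    // capture the source operand
//     add.execute(&word);              // word is now 8
//
// Callers normally hold these through the AtomicOpFunctor base pointer
// returned by GPUDynInst::makeAtomicOpFunctor() below.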

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;

    // The data to get written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, nothing is done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResp). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;

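    // A hedged sketch of how the continuation described above might be set
    // and consumed (the lambda body and the call sites are illustrative, not
    // taken verbatim from the gem5 sources):
    //
    //     // when a store with release semantics is issued:
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr gdi) {
    //             // issue the separate release request for gdi here
    //         };
    //
    //     // later, when the response for the normal store arrives:
    //     if (gpuDynInst->useContinuation)
    //         gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
    //                                      gpuDynInst);
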
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
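
    // A hedged usage sketch (caller and operand names are illustrative): a
    // memory pipeline that needs to attach an atomic operation to a request
    // could build the functor from the current lane's operands, e.g.
    //
    //     AtomicOpFunctor *amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src0, &src1);
    //
    // The second operand is only read for CAS; the functor is heap-allocated,
    // so ownership passes to the caller (or to the request it is attached to).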

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
            assert(false);
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
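
    // A worked example (hypothetical instruction): for a global-segment
    // atomic add with device scope, release semantics, and a returned value,
    // the logic above reduces to:
    //
    //     req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
    //     req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
    //                                 Request::DEVICE_SCOPE);
    //     req->setFlags(Request::RELEASE);
    //     req->setFlags(Request::ATOMIC_RETURN_OP);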

    // Map returned packets and the addresses they satisfy with which lane
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

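    // For example, an entry such as { 0x1000 -> {0, 3} } (hypothetical
    // address) records that the packet for address 0x1000 satisfies the
    // requests made on behalf of lanes 0 and 3.
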
    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__