gem5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
dispatcher.cc
Go to the documentation of this file.
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */
35 
36 
38 
#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
46 
48 
50  : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
51  pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
52  dispatchCount(0), dispatchActive(false), cpu(p->cpu),
53  shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
54 {
55  shader->handshake(this);
56  driver->handshake(this);
57 
58  ndRange.wg_disp_rem = false;
59  ndRange.globalWgId = 0;
60 
61  schedule(&tickEvent, 0);
62 
63  // translation port for the dispatcher
64  tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
65 
67  .name(name() + ".num_kernel_launched")
68  .desc("number of kernel launched")
69  ;
70 }
71 
72 GpuDispatcher *GpuDispatcherParams::create()
73 {
74  GpuDispatcher *dispatcher = new GpuDispatcher(this);
75  GpuDispatcher::setInstance(dispatcher);
76 
78 }
79 
80 void
82 {
83  Tick event_tick = 0;
84 
85  if (ndRange.wg_disp_rem)
86  fatal("Checkpointing not supported during active workgroup execution");
87 
88  if (tickEvent.scheduled())
89  event_tick = tickEvent.when();
90 
91  SERIALIZE_SCALAR(event_tick);
92 
93 }
94 
95 void
97 {
98  Tick event_tick;
99 
100  if (tickEvent.scheduled())
102 
103  UNSERIALIZE_SCALAR(event_tick);
104 
105  if (event_tick)
106  schedule(&tickEvent, event_tick);
107 }
108 
111 {
112  AddrRangeList ranges;
113 
114  DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
115  pioAddr, pioSize);
116 
117  ranges.push_back(RangeSize(pioAddr, pioSize));
118 
119  return ranges;
120 }
121 
122 Tick
124 {
125  assert(pkt->getAddr() >= pioAddr);
126  assert(pkt->getAddr() < pioAddr + pioSize);
127 
128  int offset = pkt->getAddr() - pioAddr;
129  pkt->allocate();
130 
131  DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
132 
133  if (offset < 8) {
134  assert(!offset);
135  assert(pkt->getSize() == 8);
136 
137  uint64_t retval = dispatchActive;
138  pkt->set(retval);
139  } else {
140  offset -= 8;
141  assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
142  char *curTaskPtr = (char*)&curTask;
143 
144  memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
145  }
146 
147  pkt->makeAtomicResponse();
148 
149  return pioDelay;
150 }
151 
152 Tick
154 {
155  assert(pkt->getAddr() >= pioAddr);
156  assert(pkt->getAddr() < pioAddr + pioSize);
157 
158  int offset = pkt->getAddr() - pioAddr;
159 
160 #if TRACING_ON
161  uint64_t data_val = 0;
162 
163  switch (pkt->getSize()) {
164  case 1:
165  data_val = pkt->get<uint8_t>();
166  break;
167  case 2:
168  data_val = pkt->get<uint16_t>();
169  break;
170  case 4:
171  data_val = pkt->get<uint32_t>();
172  break;
173  case 8:
174  data_val = pkt->get<uint64_t>();
175  break;
176  default:
177  DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
178  }
179 
180  DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
181  pkt->getSize());
182 #endif
183  if (!offset) {
184  static int nextId = 0;
185 
186  // The depends field of the qstruct, which was previously unused, is
187  // used to communicate with simulated application.
188  if (curTask.depends) {
189  HostState hs;
190  shader->ReadMem((uint64_t)(curTask.depends), &hs,
191  sizeof(HostState), 0);
192 
193  // update event start time (in nano-seconds)
194  uint64_t start = curTick() / 1000;
195 
196  shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
197  &start, sizeof(uint64_t), 0);
198  }
199 
200  // launch kernel
202 
203  NDRange *ndr = &(ndRangeMap[nextId]);
204  // copy dispatch info
205  ndr->q = curTask;
206 
207  // update the numDispTask polled by the runtime
208  accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
209 
210  ndr->numWgTotal = 1;
211 
212  for (int i = 0; i < 3; ++i) {
213  ndr->wgId[i] = 0;
214  ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
215  ndr->numWgTotal *= ndr->numWg[i];
216  }
217 
218  ndr->numWgCompleted = 0;
219  ndr->globalWgId = 0;
220  ndr->wg_disp_rem = true;
221  ndr->execDone = false;
222  ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
223  ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
224  ndr->dispatchId = nextId;
225  ndr->curCid = pkt->req->contextId();
226  DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
227  execIds.push(nextId);
228  ++nextId;
229 
230  dispatchActive = true;
231 
232  if (!tickEvent.scheduled()) {
234  }
235  } else {
236  // populate current task struct
237  // first 64 bits are launch reg
238  offset -= 8;
239  assert(offset < sizeof(HsaQueueEntry));
240  char *curTaskPtr = (char*)&curTask;
241  memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
242  }
243 
244  pkt->makeAtomicResponse();
245 
246  return pioDelay;
247 }
248 
249 
251 GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
252 {
253  if (if_name == "translation_port") {
254  return *tlbPort;
255  }
256 
257  return DmaDevice::getMasterPort(if_name, idx);
258 }
259 
260 void
262 {
263  int fail_count = 0;
264 
265  // There are potentially multiple outstanding kernel launches.
266  // It is possible that the workgroups in a different kernel
267  // can fit on the GPU even if another kernel's workgroups cannot
268  DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
269 
270  while (execIds.size() > fail_count) {
271  int execId = execIds.front();
272 
273  while (ndRangeMap[execId].wg_disp_rem) {
274  //update the thread context
275  shader->updateContext(ndRangeMap[execId].curCid);
276 
277  // attempt to dispatch_workgroup
278  if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
279  // if we failed try the next kernel,
280  // it may have smaller workgroups.
281  // put it on the queue to rety latter
282  DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
283  execIds.push(execId);
284  ++fail_count;
285  break;
286  }
287  }
288  // let's try the next kernel_id
289  execIds.pop();
290  }
291 
292  DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
293 
294  if (doneIds.size() && cpu) {
296  }
297 
298  while (doneIds.size()) {
299  // wakeup the CPU if any Kernels completed this cycle
300  DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
301  doneIds.pop();
302  }
303 }
304 
305 void
307 {
308  int kern_id = w->kernId;
309  DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
310  assert(ndRangeMap[kern_id].dispatchId == kern_id);
311  ndRangeMap[kern_id].numWgCompleted++;
312 
313  if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
314  ndRangeMap[kern_id].execDone = true;
315  doneIds.push(kern_id);
316 
317  if (ndRangeMap[kern_id].addrToNotify) {
318  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
319  0);
320  }
321 
322  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
323 
324  // update event end time (in nano-seconds)
325  if (ndRangeMap[kern_id].q.depends) {
326  HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
327  uint64_t event;
328  shader->ReadMem((uint64_t)(&host_state->event), &event,
329  sizeof(uint64_t), 0);
330 
331  uint64_t end = curTick() / 1000;
332 
333  shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
334  sizeof(uint64_t), 0);
335  }
336  }
337 
338  if (!tickEvent.scheduled()) {
340  }
341 }
342 
343 void
345 {
346  if (!tickEvent.scheduled())
348 }
349 
350 void
351 GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
352 {
353  if (cpu) {
354  if (off) {
355  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
356  true);
357  val += off;
358  }
359 
360  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
361  } else {
362  panic("Cannot find host");
363  }
364 }
365 
367  : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
368 {
369 }
370 
371 void
373 {
374  dispatcher->exec();
375 }
376 
377 const char*
379 {
380  return "GPU Dispatcher tick";
381 }
382 
383 // helper functions for driver to retrieve GPU attributes
384 int
386 {
387  return shader->cuList.size();
388 }
389 
390 int
392 {
393  return shader->cuList[0]->wfSize();
394 }
395 
396 void
398 {
399  shader->funcargs_size = funcargs_size;
400 }
401 
402 uint32_t
404 {
405  return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
406 }
AddrRangeList getAddrRanges() const
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
Definition: dispatcher.cc:110
#define DPRINTF(x,...)
Definition: trace.hh:212
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:342
AddrRange RangeSize(Addr start, Addr size)
Definition: addr_range.hh:398
Tick ticks(int numCycles) const
Definition: shader.hh:91
void set(T v, ByteOrder endian)
Set the value in the data pointer to v using the specified endianness.
std::vector< ComputeUnit * > cuList
Definition: shader.hh:159
uint64_t event
Definition: qstruct.hh:104
BaseCPU * cpu
Definition: dispatcher.hh:93
const std::string & name()
Definition: trace.cc:49
Bitfield< 7 > i
Definition: miscregs.hh:1378
ContextID contextId() const
Accessor function for context ID.
Definition: request.hh:694
virtual void unserialize(CheckpointIn &cp)
Unserialize an object.
Definition: dispatcher.cc:96
#define panic(...)
Definition: misc.hh:153
std::queue< int > execIds
Definition: dispatcher.hh:85
void updateContext(int cid)
Definition: shader.cc:121
ip6_addr_t addr
Definition: inet.hh:335
int wfSize() const
Definition: dispatcher.cc:391
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:381
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a wavefront.
Definition: dispatcher.cc:403
void handshake(GpuDispatcher *_dispatcher)
Definition: cl_driver.cc:89
void setFuncargsSize(int funcargs_size)
Definition: dispatcher.cc:397
Bitfield< 23, 0 > offset
Definition: types.hh:149
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
Definition: dispatcher.cc:351
int kernId
Definition: wavefront.hh:163
uint64_t addrToNotify
Definition: qstruct.hh:80
Tick write(PacketPtr pkt)
Pure virtual function that the device must implement.
Definition: dispatcher.cc:153
uint64_t depends
Definition: qstruct.hh:77
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:959
virtual void serialize(CheckpointOut &cp) const
Serialize an object.
Definition: dispatcher.cc:81
void deschedule(Event &event)
Definition: eventq.hh:734
T get(ByteOrder endian) const
Get the data in the packet byte swapped from the specified endianness.
Bitfield< 63 > val
Definition: misc.hh:770
bool execDone
Definition: ndrange.hh:62
int curCid
Definition: ndrange.hh:67
#define UNSERIALIZE_SCALAR(scalar)
Definition: serialize.hh:145
HsaQueueEntry curTask
Definition: dispatcher.hh:79
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:372
system
Definition: isa.cc:226
Tick curTick()
The current simulated tick.
Definition: core.hh:47
int numWgCompleted
Definition: ndrange.hh:55
int funcargs_size
Definition: shader.hh:143
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:161
void notifyWgCompl(Wavefront *w)
Definition: dispatcher.cc:306
ClDriver * driver
Definition: dispatcher.hh:95
virtual BaseMasterPort & getMasterPort(const std::string &if_name, PortID idx)
Get a master port with a given name and index.
Definition: dispatcher.cc:251
Tick when() const
Get the time that the event is scheduled.
Definition: eventq.hh:397
void makeAtomicResponse()
Definition: packet.hh:857
TickEvent(GpuDispatcher *)
Definition: dispatcher.cc:366
DmaDeviceParams Params
Definition: dma_device.hh:160
uint64_t Tick
Tick count type.
Definition: types.hh:63
Bitfield< 27 > q
Definition: miscregs.hh:1367
int numWg[3]
Definition: ndrange.hh:50
#define fatal(...)
Definition: misc.hh:163
const RequestPtr req
A pointer to the original request.
Definition: packet.hh:304
volatile uint32_t * numDispLeft
Definition: ndrange.hh:65
static void setInstance(GpuDispatcher *_instance)
Definition: dispatcher.hh:124
uint32_t wgSize[3]
Definition: qstruct.hh:59
uint32_t gdSize[3]
Definition: qstruct.hh:57
BaseMasterPort & getMasterPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a master port with a given name and index.
Definition: dma_device.cc:263
std::queue< int > doneIds
Definition: dispatcher.hh:87
Bitfield< 0 > w
Stats::Scalar num_kernelLaunched
Definition: dispatcher.hh:108
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:245
Bitfield< 10, 5 > event
static GpuDispatcher * getInstance()
Definition: dispatcher.hh:129
bool wg_disp_rem
Definition: ndrange.hh:60
Tick read(PacketPtr pkt)
Pure virtual function that the device must implement.
Definition: dispatcher.cc:123
#define SERIALIZE_SCALAR(scalar)
Definition: serialize.hh:143
uint32_t globalWgId
Definition: ndrange.hh:57
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:254
Shader * shader
Definition: dispatcher.hh:94
virtual const std::string name() const
Definition: sim_object.hh:117
int numWgTotal
Definition: ndrange.hh:52
std::ostream CheckpointOut
Definition: serialize.hh:67
void scheduleDispatch()
Definition: dispatcher.cc:344
Definition: eventq.hh:185
A BaseMasterPort is a protocol-agnostic master port, responsible only for the structural connection t...
Definition: port.hh:115
volatile bool * addrToNotify
Definition: ndrange.hh:64
T divCeil(const T &a, const U &b)
Definition: intmath.hh:198
bool dispatchActive
Definition: dispatcher.hh:91
void schedule(Event &event, Tick when)
Definition: eventq.hh:728
HsaQueueEntry q
Definition: ndrange.hh:45
TLBPort * tlbPort
Definition: dispatcher.hh:149
const char * description() const
Return a C string describing the event.
Definition: dispatcher.cc:378
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Definition: statistics.hh:287
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:181
TickEvent tickEvent
Definition: dispatcher.hh:96
std::unordered_map< int, NDRange > ndRangeMap
Definition: dispatcher.hh:81
uint64_t numDispLeft
Definition: qstruct.hh:82
static GpuDispatcher * instance
Definition: dispatcher.hh:98
unsigned getSize() const
Definition: packet.hh:649
NDRange ndRange
Definition: dispatcher.hh:82
bool dispatch_workgroups(NDRange *ndr)
Definition: shader.cc:171
Bitfield< 0 > p
GpuDispatcher(const Params *p)
Definition: dispatcher.cc:49
int wgId[3]
Definition: ndrange.hh:48
void hostWakeUp(BaseCPU *cpu)
Definition: shader.cc:129
void allocate()
Allocate memory for the packet.
Definition: packet.hh:1082
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:359
void handshake(GpuDispatcher *dispatcher)
Definition: shader.cc:220
Addr getAddr() const
Definition: packet.hh:639
int dispatchId
Definition: ndrange.hh:66

Generated on Fri Jun 9 2017 13:03:47 for gem5 by doxygen 1.8.6