gem5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
shader.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its contributors
18  * may be used to endorse or promote products derived from this software
19  * without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Author: Steve Reinhardt
34  */
35 
36 #include "gpu-compute/shader.hh"
37 
38 #include <limits>
39 
40 #include "arch/x86/linux/linux.hh"
41 #include "base/chunk_generator.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/HSAIL.hh"
47 #include "gpu-compute/qstruct.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
51 #include "sim/sim_exit.hh"
52 
// NOTE(review): this Doxygen dump is missing source line 53, the constructor
// signature -- per the index below it is Shader::Shader(const Params *p).
// The member-initializer list pulls configuration from the Python-generated
// parameter struct; the counters (sa_n, tick_cnt, ...) start at zero.
54  clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
55  cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
56  hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
57  separate_acquire_release(p->separate_acquire_release), coissue_return(1),
58  trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
59  globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
60  box_tick_cnt(0), start_tick_cnt(0)
61 {
62 
    // Adopt the compute units built by the config system. Each CU's slot in
    // cuList must match its cu_id, and each CU gets a back-pointer to this
    // shader so it can reach shared state.
63  cuList.resize(n_cu);
64 
65  for (int i = 0; i < n_cu; ++i) {
66  cuList[i] = p->CUs[i];
67  assert(i == cuList[i]->cu_id);
68  cuList[i]->shader = this;
69  }
70 }
71 
72 Addr
// NOTE(review): source line 73, the signature, is missing from this dump.
// Per the index below it is Shader::mmap(int length): reserves 'length'
// bytes (rounded up to a whole page) in the host process's mmap region and
// returns the start address of the reservation.
74 {
75 
76  Addr start;
77 
78  // round up length to the next page
79  length = roundUp(length, TheISA::PageBytes);
80 
81  Process *proc = gpuTc->getProcessPtr();
82  auto mem_state = proc->memState;
83 
    // Grow the mmap region in whichever direction the target platform uses,
    // keeping the process's mmap-end marker consistent with the new grant.
84  if (proc->mmapGrowsDown()) {
85  DPRINTF(HSAIL, "GROWS DOWN");
86  start = mem_state->getMmapEnd() - length;
87  mem_state->setMmapEnd(start);
88  } else {
89  DPRINTF(HSAIL, "GROWS UP");
90  start = mem_state->getMmapEnd();
91  mem_state->setMmapEnd(start + length);
92 
93  // assertion to make sure we don't overwrite the stack (it grows down)
94  assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
95  mem_state->getMmapEnd());
96  }
97 
98  DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
99 
    // Back the reserved virtual range with pages in the process page table.
100  proc->allocateMem(start, length);
101 
102  return start;
103 }
104 
105 void
// NOTE(review): source line 106, the signature of Shader::init(), is missing
// from this dump (see the index entry "virtual void init()" below). Called
// after SimObject construction; binds gpuTc to context 0 of the host CPU.
107 {
108  // grab the threadContext of the thread running on the CPU
109  assert(cpuPointer);
110  gpuTc = cpuPointer->getContext(0);
111  assert(gpuTc);
112 }
113 
115 {
116  for (int j = 0; j < n_cu; ++j)
117  delete cuList[j];
118 }
119 
120 void
// NOTE(review): source line 121 -- the signature and opening brace of
// Shader::updateContext(int cid), per the index below -- is missing from
// this dump. Re-binds gpuTc to the hardware context 'cid' of the host CPU.
122  // context of the thread which dispatched work
123  assert(cpuPointer);
124  gpuTc = cpuPointer->getContext(cid);
125  assert(gpuTc);
126 }
127 
128 void
// NOTE(review): source lines 129 (the signature, Shader::hostWakeUp(BaseCPU
// *cpu) per the index below) and 131 are missing from this dump. Line 131
// presumably guarded activateContext on the thread's status (the index
// references ThreadContext::status() and "Temporarily inactive") -- confirm
// against the gem5 repository before relying on this.
130  if (cpuPointer == cpu) {
132  cpu->activateContext(gpuTc->threadId());
133  } else {
134  //Make sure both dispatcher and shader are trying to
135  //wakeup same host. Hack here to enable kernel launch
136  //from multiple CPUs
137  panic("Dispatcher wants to wakeup a different host");
138  }
139 }
140 
141 Shader*
142 ShaderParams::create()
143 {
144  return new Shader(this);
145 }
146 
147 void
// NOTE(review): source lines 148 (the signature, Shader::exec() per the
// index below) and 151 are missing from this dump -- confirm the elided
// line against the gem5 repository.
149 {
150  tick_cnt = curTick();
152 
153  // apply any scheduled adds
    // Entries registered via ScheduleAdd whose deadline (sa_when) has
    // passed are applied (*sa_val += sa_x) and erased in place; the index
    // and the live count are stepped back so the element shifted into slot
    // i is not skipped.
154  for (int i = 0; i < sa_n; ++i) {
155  if (sa_when[i] <= tick_cnt) {
156  *sa_val[i] += sa_x[i];
157  sa_val.erase(sa_val.begin() + i);
158  sa_x.erase(sa_x.begin() + i);
159  sa_when.erase(sa_when.begin() + i);
160  --sa_n;
161  --i;
162  }
163  }
164 
165  // clock all of the cu's
166  for (int i = 0; i < n_cu; ++i)
167  cuList[i]->exec();
168 }
169 
170 bool
// NOTE(review): source line 171, the signature, is missing from this dump.
// Per the index below it is Shader::dispatch_workgroups(NDRange *ndr):
// round-robins over the compute units starting at nextSchedCu, dispatching
// one workgroup to each CU that can accept one, and returns whether
// anything was scheduled.
172 {
173  bool scheduledSomething = false;
174  int cuCount = 0;
175  int curCu = nextSchedCu;
176 
177  while (cuCount < n_cu) {
178  //Every time we try a CU, update nextSchedCu
179  nextSchedCu = (nextSchedCu + 1) % n_cu;
180 
181  // dispatch workgroup iff the following two conditions are met:
182  // (a) wg_rem is true - there are unassigned workgroups in the grid
183  // (b) there are enough free slots in cu cuList[i] for this wg
184  if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
185  scheduledSomething = true;
186  DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
187 
188  // ticks() member function translates cycles to simulation ticks.
189  if (!tickEvent.scheduled()) {
190  schedule(tickEvent, curTick() + this->ticks(1));
191  }
192 
193  cuList[curCu]->StartWorkgroup(ndr);
    // Advance the 3-D workgroup id with carry: x overflows into y and
    // y into z; once z overflows the whole grid has been dispatched and
    // wg_disp_rem is cleared.
194  ndr->wgId[0]++;
195  ndr->globalWgId++;
196  if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
197  ndr->wgId[0] = 0;
198  ndr->wgId[1]++;
199 
200  if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
201  ndr->wgId[1] = 0;
202  ndr->wgId[2]++;
203 
204  if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
205  ndr->wg_disp_rem = false;
206  break;
207  }
208  }
209  }
210  }
211 
212  ++cuCount;
213  curCu = nextSchedCu;
214  }
215 
216  return scheduledSomething;
217 }
218 
219 void
221 {
222  dispatcher = _dispatcher;
223 }
224 
225 void
227  bool suppress_func_errors, int cu_id)
228 {
229  int block_size = cuList.at(cu_id)->cacheLineSize();
230  unsigned size = req->getSize();
231 
232  Addr tmp_addr;
233  BaseTLB::Mode trans_mode;
234 
235  if (cmd == MemCmd::ReadReq) {
236  trans_mode = BaseTLB::Read;
237  } else if (cmd == MemCmd::WriteReq) {
238  trans_mode = BaseTLB::Write;
239  } else {
240  fatal("unexcepted MemCmd\n");
241  }
242 
243  tmp_addr = req->getVaddr();
244  Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
245 
246  assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
247 
248  // Misaligned access
249  if (split_addr > tmp_addr) {
250  RequestPtr req1, req2;
251  req->splitOnVaddr(split_addr, req1, req2);
252 
253 
254  PacketPtr pkt1 = new Packet(req2, cmd);
255  PacketPtr pkt2 = new Packet(req1, cmd);
256 
257  functionalTLBAccess(pkt1, cu_id, trans_mode);
258  functionalTLBAccess(pkt2, cu_id, trans_mode);
259 
260  PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
261  PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
262 
263  new_pkt1->dataStatic(data);
264  new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
265 
266  if (suppress_func_errors) {
267  new_pkt1->setSuppressFuncError();
268  new_pkt2->setSuppressFuncError();
269  }
270 
271  // fixme: this should be cuList[cu_id] if cu_id != n_cu
272  // The latter requires a memPort in the dispatcher
273  cuList[0]->memPort[0]->sendFunctional(new_pkt1);
274  cuList[0]->memPort[0]->sendFunctional(new_pkt2);
275 
276  delete new_pkt1;
277  delete new_pkt2;
278  delete pkt1;
279  delete pkt2;
280  } else {
281  PacketPtr pkt = new Packet(req, cmd);
282  functionalTLBAccess(pkt, cu_id, trans_mode);
283  PacketPtr new_pkt = new Packet(pkt->req, cmd);
284  new_pkt->dataStatic(data);
285 
286  if (suppress_func_errors) {
287  new_pkt->setSuppressFuncError();
288  };
289 
290  // fixme: this should be cuList[cu_id] if cu_id != n_cu
291  // The latter requires a memPort in the dispatcher
292  cuList[0]->memPort[0]->sendFunctional(new_pkt);
293 
294  delete new_pkt;
295  delete pkt;
296  }
297 }
298 
299 bool
301 {
302  for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
303  if (!cuList[i_cu]->isDone()) {
304  return true;
305  }
306  }
307 
308  return false;
309 }
310 
311 void
312 Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
313 {
314  sa_val.push_back(val);
315  sa_when.push_back(tick_cnt + when);
316  sa_x.push_back(x);
317  ++sa_n;
318 }
319 
// NOTE(review): source line 320, the constructor signature -- per the index
// below, TickEvent(Shader *_shader) -- is missing from this dump. Runs at
// CPU tick priority and remembers the owning shader.
321 : Event(CPU_Tick_Pri), shader(_shader)
322 {
323 }
324 
325 
326 void
// NOTE(review): source line 327, the signature of the tick event's
// process() (see the index entry "void process() Definition: shader.cc:327"
// below), is missing from this dump. Clocks the shader and reschedules
// itself one shader cycle later while any compute unit is still busy.
328 {
329  if (shader->busy()) {
330  shader->exec();
331  shader->schedule(this, curTick() + shader->ticks(1));
332  }
333 }
334 
335 const char*
// NOTE(review): source line 336, the signature of the tick event's
// description() (see the index entry "const char * description() const"
// below), is missing from this dump. Returns a human-readable event name.
337 {
338  return "Shader tick";
339 }
340 
341 void
342 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
343  MemCmd cmd, bool suppress_func_errors)
344 {
345  uint8_t *data_buf = (uint8_t*)ptr;
346 
347  for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
348  !gen.done(); gen.next()) {
349  Request *req = new Request(0, gen.addr(), gen.size(), 0,
350  cuList[0]->masterId(), 0, 0, 0);
351 
352  doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
353  data_buf += gen.size();
354  delete req;
355  }
356 }
357 
358 void
359 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
360 {
361  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
362 }
363 
364 void
365 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
366  bool suppress_func_errors)
367 {
368  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
369 }
370 
371 void
372 Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
373 {
374  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
375 }
376 
377 void
378 Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
379  bool suppress_func_errors)
380 {
381  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
382  suppress_func_errors);
383 }
384 
385 /*
386  * Send a packet through the appropriate TLB functional port.
387  * If cu_id=n_cu, then this is the dispatcher's TLB.
388  * Otherwise it's the TLB of the cu_id compute unit.
389  */
390 void
// NOTE(review): source line 391 (the signature,
// Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
// per the index below) and line 398 (the body of the cu_id == n_cu branch)
// are missing from this dump. Line 398 presumably forwarded the packet
// through the dispatcher's tlbPort -- confirm against the gem5 repository.
392 {
393  // update senderState. Need to know the gpuTc and the TLB mode
394  pkt->senderState =
395  new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
396 
397  if (cu_id == n_cu) {
399  } else {
400  // even when the perLaneTLB flag is turned on
401  // it's ok to send all accesses through lane 0
402  // since the lane # is not known here,
403  // This isn't important since these are functional accesses.
404  cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
405  }
406 
407  /* safe_cast the senderState */
408  TheISA::GpuTLB::TranslationState *sender_state =
409  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
410 
    // Free the TLB entry allocated during translation along with the
    // sender state before the caller reuses or deletes the packet.
411  delete sender_state->tlbEntry;
412  delete pkt->senderState;
413 }
void process()
Definition: shader.cc:327
#define DPRINTF(x,...)
Definition: trace.hh:212
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:342
std::vector< int32_t > sa_x
Definition: shader.hh:156
Tick ticks(int numCycles) const
Definition: shader.hh:91
std::vector< ComputeUnit * > cuList
Definition: shader.hh:159
Definition: packet.hh:73
void setSuppressFuncError()
Definition: packet.hh:621
TickEvent(Shader *)
Definition: shader.cc:320
Bitfield< 7 > i
Definition: miscregs.hh:1378
#define panic(...)
Definition: misc.hh:153
int n_cu
Definition: shader.hh:129
BaseCPU * cpuPointer
Definition: shader.hh:100
void updateContext(int cid)
Definition: shader.cc:121
void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition: shader.cc:226
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition: process.cc:310
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward, but a few (such as Alpha) grow upward instead, so they can override this method to return false.
Definition: process.hh:144
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:381
GpuDispatcher * dispatcher
Definition: shader.hh:165
virtual Process * getProcessPtr()=0
std::vector< uint64_t > sa_when
Definition: shader.hh:154
Definition: shader.hh:76
Bitfield< 4, 0 > mode
Definition: miscregs.hh:1385
TickEvent tickEvent
Definition: shader.hh:113
T roundUp(const T &val, const U &align)
Definition: intmath.hh:205
std::shared_ptr< MemState > memState
Definition: process.hh:206
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:909
Bitfield< 63 > val
Definition: misc.hh:770
const char data[]
Definition: circlebuf.cc:43
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:372
void exec()
Definition: shader.cc:148
Tick curTick()
The current simulated tick.
Definition: core.hh:47
int nextSchedCu
Definition: shader.hh:146
uint64_t Tick
Tick count type.
Definition: types.hh:63
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
#define fatal(...)
Definition: misc.hh:163
const RequestPtr req
A pointer to the original request.
Definition: packet.hh:304
const char * description() const
Return a C string describing the event.
Definition: shader.cc:336
void splitOnVaddr(Addr split_addr, RequestPtr &req1, RequestPtr &req2)
Generate two requests as if this request had been split into two pieces.
Definition: request.hh:497
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: shader.cc:106
T roundDown(const T &val, const U &align)
Definition: intmath.hh:213
uint32_t wgSize[3]
Definition: qstruct.hh:59
uint32_t gdSize[3]
Definition: qstruct.hh:57
ThreadContext * gpuTc
Definition: shader.hh:99
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
Definition: shader.cc:391
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:142
T safe_cast(U ptr)
Definition: cast.hh:61
Shader(const Params *p)
Definition: shader.cc:53
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:245
~Shader()
Definition: shader.cc:114
const Addr PageBytes
Definition: isa_traits.hh:52
bool wg_disp_rem
Definition: ndrange.hh:60
Bitfield< 24 > j
Definition: miscregs.hh:1369
uint32_t globalWgId
Definition: ndrange.hh:57
Mode
Definition: tlb.hh:61
Addr mmap(int length)
Definition: shader.cc:73
int size()
Definition: pagetable.hh:146
void ScheduleAdd(uint32_t *val, Tick when, int x)
Definition: shader.cc:312
Declaration of the Packet class.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
SenderState * senderState
This packet's sender state.
Definition: packet.hh:454
Definition: eventq.hh:185
Addr getVaddr() const
Definition: request.hh:616
virtual int threadId() const =0
uint32_t sa_n
Definition: shader.hh:149
void schedule(Event &event, Tick when)
Definition: eventq.hh:728
virtual Status status() const =0
HsaQueueEntry q
Definition: ndrange.hh:45
TLBPort * tlbPort
Definition: dispatcher.hh:149
Temporarily inactive.
uint8_t length
Definition: inet.hh:334
uint64_t start_tick_cnt
Definition: shader.hh:163
Declaration and inline definition of ChunkGenerator object.
uint64_t box_tick_cnt
Definition: shader.hh:162
unsigned getSize() const
Definition: request.hh:552
bool dispatch_workgroups(NDRange *ndr)
Definition: shader.cc:171
uint64_t tick_cnt
Definition: shader.hh:161
Bitfield< 0 > p
int wgId[3]
Definition: ndrange.hh:48
Bitfield< 1 > x
Definition: types.hh:105
void hostWakeUp(BaseCPU *cpu)
Definition: shader.cc:129
ShaderParams Params
Definition: shader.hh:84
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:359
void sendFunctional(PacketPtr pkt)
Send a functional request packet, where the data is instantly updated everywhere in the memory system...
Definition: port.cc:173
bool busy()
Definition: shader.cc:300
void handshake(GpuDispatcher *dispatcher)
Definition: shader.cc:220
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:102
std::vector< uint32_t * > sa_val
Definition: shader.hh:152

Generated on Fri Jun 9 2017 13:03:48 for gem5 by doxygen 1.8.6