gem5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
shader.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its contributors
18  * may be used to endorse or promote products derived from this software
19  * without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Author: Steve Reinhardt
34  */
35 
36 #include "gpu-compute/shader.hh"
37 
38 #include <limits>
39 
40 #include "arch/x86/linux/linux.hh"
41 #include "base/chunk_generator.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/HSAIL.hh"
47 #include "gpu-compute/qstruct.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
51 #include "sim/sim_exit.hh"
52 
// NOTE(review): this Doxygen dump is missing source line 53, the constructor
// signature -- per the index below it is Shader::Shader(const Params *p).
// The member-initializer list pulls configuration from the Python-generated
// parameter struct; the counters (sa_n, tick_cnt, ...) start at zero.
54  clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
55  cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
56  hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
57  separate_acquire_release(p->separate_acquire_release), coissue_return(1),
58  trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
59  globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
60  box_tick_cnt(0), start_tick_cnt(0)
61 {
62 
    // Adopt the compute units built by the config system. Each CU's slot in
    // cuList must match its cu_id, and each CU gets a back-pointer to this
    // shader so it can reach shared state.
63  cuList.resize(n_cu);
64 
65  for (int i = 0; i < n_cu; ++i) {
66  cuList[i] = p->CUs[i];
67  assert(i == cuList[i]->cu_id);
68  cuList[i]->shader = this;
69  }
70 }
71 
72 Addr
// NOTE(review): source line 73, the signature, is missing from this dump.
// Per the index below it is Shader::mmap(int length): reserves 'length'
// bytes (rounded up to a whole page) in the host process's mmap region and
// returns the start address of the reservation.
74 {
75 
76  Addr start;
77 
78  // round up length to the next page
79  length = roundUp(length, TheISA::PageBytes);
80 
81  Process *proc = gpuTc->getProcessPtr();
82  auto mem_state = proc->memState;
83 
    // Grow the mmap region in whichever direction the target platform uses,
    // keeping the process's mmap-end marker consistent with the new grant.
84  if (proc->mmapGrowsDown()) {
85  DPRINTF(HSAIL, "GROWS DOWN");
86  start = mem_state->getMmapEnd() - length;
87  mem_state->setMmapEnd(start);
88  } else {
89  DPRINTF(HSAIL, "GROWS UP");
90  start = mem_state->getMmapEnd();
91  mem_state->setMmapEnd(start + length);
92 
93  // assertion to make sure we don't overwrite the stack (it grows down)
94  assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
95  mem_state->getMmapEnd());
96  }
97 
98  DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
99 
    // Back the reserved virtual range with pages in the process page table.
100  proc->allocateMem(start, length);
101 
102  return start;
103 }
104 
105 void
// NOTE(review): source line 106, the signature of Shader::init(), is missing
// from this dump (see the index entry "virtual void init()" below). Called
// after SimObject construction; binds gpuTc to context 0 of the host CPU.
107 {
108  // grab the threadContext of the thread running on the CPU
109  assert(cpuPointer);
110  gpuTc = cpuPointer->getContext(0);
111  assert(gpuTc);
112 }
113 
115 {
116  for (int j = 0; j < n_cu; ++j)
117  delete cuList[j];
118 }
119 
120 void
// NOTE(review): source line 121 -- the signature and opening brace of
// Shader::updateContext(int cid), per the index below -- is missing from
// this dump. Re-binds gpuTc to the hardware context 'cid' of the host CPU.
122  // context of the thread which dispatched work
123  assert(cpuPointer);
124  gpuTc = cpuPointer->getContext(cid);
125  assert(gpuTc);
126 }
127 
128 void
// NOTE(review): source lines 129 (the signature, Shader::hostWakeUp(BaseCPU
// *cpu) per the index below) and 131 are missing from this dump. Line 131
// presumably guarded activateContext on the thread's status (the index
// references ThreadContext::status() and "Temporarily inactive") -- confirm
// against the gem5 repository before relying on this.
130  if (cpuPointer == cpu) {
132  cpu->activateContext(gpuTc->threadId());
133  } else {
134  //Make sure both dispatcher and shader are trying to
135  //wakeup same host. Hack here to enable kernel launch
136  //from multiple CPUs
137  panic("Dispatcher wants to wakeup a different host");
138  }
139 }
140 
141 Shader*
142 ShaderParams::create()
143 {
144  return new Shader(this);
145 }
146 
147 void
// NOTE(review): source lines 148 (the signature, Shader::exec() per the
// index below) and 151 are missing from this dump -- confirm the elided
// line against the gem5 repository.
149 {
150  tick_cnt = curTick();
152 
153  // apply any scheduled adds
    // Entries registered via ScheduleAdd whose deadline (sa_when) has
    // passed are applied (*sa_val += sa_x) and erased in place; the index
    // and the live count are stepped back so the element shifted into slot
    // i is not skipped.
154  for (int i = 0; i < sa_n; ++i) {
155  if (sa_when[i] <= tick_cnt) {
156  *sa_val[i] += sa_x[i];
157  sa_val.erase(sa_val.begin() + i);
158  sa_x.erase(sa_x.begin() + i);
159  sa_when.erase(sa_when.begin() + i);
160  --sa_n;
161  --i;
162  }
163  }
164 
165  // clock all of the cu's
166  for (int i = 0; i < n_cu; ++i)
167  cuList[i]->exec();
168 }
169 
170 bool
// NOTE(review): source line 171, the signature, is missing from this dump.
// Per the index below it is Shader::dispatch_workgroups(NDRange *ndr):
// round-robins over the compute units starting at nextSchedCu, dispatching
// one workgroup to each CU that can accept one, and returns whether
// anything was scheduled.
172 {
173  bool scheduledSomething = false;
174  int cuCount = 0;
175  int curCu = nextSchedCu;
176 
177  while (cuCount < n_cu) {
178  //Every time we try a CU, update nextSchedCu
179  nextSchedCu = (nextSchedCu + 1) % n_cu;
180 
181  // dispatch workgroup iff the following two conditions are met:
182  // (a) wg_rem is true - there are unassigned workgroups in the grid
183  // (b) there are enough free slots in cu cuList[i] for this wg
184  if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
185  scheduledSomething = true;
186  DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
187 
188  // ticks() member function translates cycles to simulation ticks.
189  if (!tickEvent.scheduled()) {
190  schedule(tickEvent, curTick() + this->ticks(1));
191  }
192 
193  cuList[curCu]->StartWorkgroup(ndr);
    // Advance the 3-D workgroup id with carry: x overflows into y and
    // y into z; once z overflows the whole grid has been dispatched and
    // wg_disp_rem is cleared.
194  ndr->wgId[0]++;
195  ndr->globalWgId++;
196  if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
197  ndr->wgId[0] = 0;
198  ndr->wgId[1]++;
199 
200  if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
201  ndr->wgId[1] = 0;
202  ndr->wgId[2]++;
203 
204  if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
205  ndr->wg_disp_rem = false;
206  break;
207  }
208  }
209  }
210  }
211 
212  ++cuCount;
213  curCu = nextSchedCu;
214  }
215 
216  return scheduledSomething;
217 }
218 
219 void
221 {
222  dispatcher = _dispatcher;
223 }
224 
225 void
227  bool suppress_func_errors, int cu_id)
228 {
229  int block_size = cuList.at(cu_id)->cacheLineSize();
230  unsigned size = req->getSize();
231 
232  Addr tmp_addr;
233  BaseTLB::Mode trans_mode;
234 
235  if (cmd == MemCmd::ReadReq) {
236  trans_mode = BaseTLB::Read;
237  } else if (cmd == MemCmd::WriteReq) {
238  trans_mode = BaseTLB::Write;
239  } else {
240  fatal("unexcepted MemCmd\n");
241  }
242 
243  tmp_addr = req->getVaddr();
244  Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
245 
246  assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
247 
248  // Misaligned access
249  if (split_addr > tmp_addr) {
250  RequestPtr req1, req2;
251  req->splitOnVaddr(split_addr, req1, req2);
252 
253 
254  PacketPtr pkt1 = new Packet(req2, cmd);
255  PacketPtr pkt2 = new Packet(req1, cmd);
256 
257  functionalTLBAccess(pkt1, cu_id, trans_mode);
258  functionalTLBAccess(pkt2, cu_id, trans_mode);
259 
260  PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
261  PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
262 
263  new_pkt1->dataStatic(data);
264  new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
265 
266  if (suppress_func_errors) {
267  new_pkt1->setSuppressFuncError();
268  new_pkt2->setSuppressFuncError();
269  }
270 
271  // fixme: this should be cuList[cu_id] if cu_id != n_cu
272  // The latter requires a memPort in the dispatcher
273  cuList[0]->memPort[0]->sendFunctional(new_pkt1);
274  cuList[0]->memPort[0]->sendFunctional(new_pkt2);
275 
276  delete new_pkt1;
277  delete new_pkt2;
278  delete pkt1;
279  delete pkt2;
280  } else {
281  PacketPtr pkt = new Packet(req, cmd);
282  functionalTLBAccess(pkt, cu_id, trans_mode);
283  PacketPtr new_pkt = new Packet(pkt->req, cmd);
284  new_pkt->dataStatic(data);
285 
286  if (suppress_func_errors) {
287  new_pkt->setSuppressFuncError();
288  };
289 
290  // fixme: this should be cuList[cu_id] if cu_id != n_cu
291  // The latter requires a memPort in the dispatcher
292  cuList[0]->memPort[0]->sendFunctional(new_pkt);
293 
294  delete new_pkt;
295  delete pkt;
296  }
297 }
298 
299 bool
301 {
302  for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
303  if (!cuList[i_cu]->isDone()) {
304  return true;
305  }
306  }
307 
308  return false;
309 }
310 
311 void
312 Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
313 {
314  sa_val.push_back(val);
315  sa_when.push_back(tick_cnt + when);
316  sa_x.push_back(x);
317  ++sa_n;
318 }
319 
// NOTE(review): source line 320, the constructor signature -- per the index
// below, TickEvent(Shader *_shader) -- is missing from this dump. Runs at
// CPU tick priority and remembers the owning shader.
321 : Event(CPU_Tick_Pri), shader(_shader)
322 {
323 }
324 
325 
326 void
// NOTE(review): source line 327, the signature of the tick event's
// process() (see the index entry "void process() Definition: shader.cc:327"
// below), is missing from this dump. Clocks the shader and reschedules
// itself one shader cycle later while any compute unit is still busy.
328 {
329  if (shader->busy()) {
330  shader->exec();
331  shader->schedule(this, curTick() + shader->ticks(1));
332  }
333 }
334 
335 const char*
// NOTE(review): source line 336, the signature of the tick event's
// description() (see the index entry "const char * description() const"
// below), is missing from this dump. Returns a human-readable event name.
337 {
338  return "Shader tick";
339 }
340 
341 void
342 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
343  MemCmd cmd, bool suppress_func_errors)
344 {
345  uint8_t *data_buf = (uint8_t*)ptr;
346 
347  for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
348  !gen.done(); gen.next()) {
349  Request *req = new Request(0, gen.addr(), gen.size(), 0,
350  cuList[0]->masterId(), 0, 0, 0);
351 
352  doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
353  data_buf += gen.size();
354  delete req;
355  }
356 }
357 
358 void
359 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
360 {
361  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
362 }
363 
364 void
365 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
366  bool suppress_func_errors)
367 {
368  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
369 }
370 
371 void
372 Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
373 {
374  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
375 }
376 
377 void
378 Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
379  bool suppress_func_errors)
380 {
381  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
382  suppress_func_errors);
383 }
384 
385 /*
386  * Send a packet through the appropriate TLB functional port.
387  * If cu_id=n_cu, then this is the dispatcher's TLB.
388  * Otherwise it's the TLB of the cu_id compute unit.
389  */
390 void
// NOTE(review): source line 391 (the signature,
// Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
// per the index below) and line 398 (the body of the cu_id == n_cu branch)
// are missing from this dump. Line 398 presumably forwarded the packet
// through the dispatcher's tlbPort -- confirm against the gem5 repository.
392 {
393  // update senderState. Need to know the gpuTc and the TLB mode
394  pkt->senderState =
395  new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
396 
397  if (cu_id == n_cu) {
399  } else {
400  // even when the perLaneTLB flag is turned on
401  // it's ok to send all accesses through lane 0
402  // since the lane # is not known here,
403  // This isn't important since these are functional accesses.
404  cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
405  }
406 
407  /* safe_cast the senderState */
408  TheISA::GpuTLB::TranslationState *sender_state =
409  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
410 
    // Free the TLB entry allocated during translation along with the
    // sender state before the caller reuses or deletes the packet.
411  delete sender_state->tlbEntry;
412  delete pkt->senderState;
413 }
void process()
Definition: shader.cc:327
#define DPRINTF(x,...)
Definition: trace.hh:212
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:342
std::vector< int32_t > sa_x
Definition: shader.hh:156
Tick ticks(int numCycles) const
Definition: shader.hh:91
std::vector< ComputeUnit * > cuList
Definition: shader.hh:159
Definition: packet.hh:73
void setSuppressFuncError()
Definition: packet.hh:621
TickEvent(Shader *)
Definition: shader.cc:320
Bitfield< 7 > i
Definition: miscregs.hh:1378
#define panic(...)
Definition: misc.hh:153
int n_cu
Definition: shader.hh:129
BaseCPU * cpuPointer
Definition: shader.hh:100
void updateContext(int cid)
Definition: shader.cc:121
void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition: shader.cc:226
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition: process.cc:310
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward, but a few (such as Alpha) grow upward instead, so they can override this method to return false.
Definition: process.hh:144
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:381
GpuDispatcher * dispatcher
Definition: shader.hh:165
virtual Process * getProcessPtr()=0
std::vector< uint64_t > sa_when
Definition: shader.hh:154
Definition: shader.hh:76
Bitfield< 4, 0 > mode
Definition: miscregs.hh:1385
TickEvent tickEvent
Definition: shader.hh:113
T roundUp(const T &val, const U &align)
Definition: intmath.hh:205
std::shared_ptr< MemState > memState
Definition: process.hh:206
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:909
Bitfield< 63 > val
Definition: misc.hh:770
const char data[]
Definition: circlebuf.cc:43
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:372
void exec()
Definition: shader.cc:148
Tick curTick()
The current simulated tick.
Definition: core.hh:47
int nextSchedCu
Definition: shader.hh:146
uint64_t Tick
Tick count type.
Definition: types.hh:63
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
#define fatal(...)
Definition: misc.hh:163
const RequestPtr req
A pointer to the original request.
Definition: packet.hh:304
const char * description() const
Return a C string describing the event.
Definition: shader.cc:336
void splitOnVaddr(Addr split_addr, RequestPtr &req1, RequestPtr &req2)
Generate two requests as if this request had been split into two pieces.
Definition: request.hh:497
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: shader.cc:106
T roundDown(const T &val, const U &align)
Definition: intmath.hh:213
uint32_t wgSize[3]
Definition: qstruct.hh:59
uint32_t gdSize[3]
Definition: qstruct.hh:57
ThreadContext * gpuTc
Definition: shader.hh:99
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
Definition: shader.cc:391
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:142
T safe_cast(U ptr)
Definition: cast.hh:61
Shader(const Params *p)
Definition: shader.cc:53
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:245
~Shader()
Definition: shader.cc:114
const Addr PageBytes
Definition: isa_traits.hh:52
bool wg_disp_rem
Definition: ndrange.hh:60
Bitfield< 24 > j
Definition: miscregs.hh:1369
uint32_t globalWgId
Definition: ndrange.hh:57
Mode
Definition: tlb.hh:61
Addr mmap(int length)
Definition: shader.cc:73
int size()
Definition: pagetable.hh:146
void ScheduleAdd(uint32_t *val, Tick when, int x)
Definition: shader.cc:312
Declaration of the Packet class.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
SenderState * senderState
This packet's sender state.
Definition: packet.hh:454
Definition: eventq.hh:185
Addr getVaddr() const
Definition: request.hh:616
virtual int threadId() const =0
uint32_t sa_n
Definition: shader.hh:149
void schedule(Event &event, Tick when)
Definition: eventq.hh:728
virtual Status status() const =0
HsaQueueEntry q
Definition: ndrange.hh:45
TLBPort * tlbPort
Definition: dispatcher.hh:149
Temporarily inactive.
uint8_t length
Definition: inet.hh:334
uint64_t start_tick_cnt
Definition: shader.hh:163
Declaration and inline definition of ChunkGenerator object.
uint64_t box_tick_cnt
Definition: shader.hh:162
unsigned getSize() const
Definition: request.hh:552
bool dispatch_workgroups(NDRange *ndr)
Definition: shader.cc:171
uint64_t tick_cnt
Definition: shader.hh:161
Bitfield< 0 > p
int wgId[3]
Definition: ndrange.hh:48
Bitfield< 1 > x
Definition: types.hh:105
void hostWakeUp(BaseCPU *cpu)
Definition: shader.cc:129
ShaderParams Params
Definition: shader.hh:84
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:359
void sendFunctional(PacketPtr pkt)
Send a functional request packet, where the data is instantly updated everywhere in the memory system...
Definition: port.cc:173
bool busy()
Definition: shader.cc:300
void handshake(GpuDispatcher *dispatcher)
Definition: shader.cc:220
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:102
std::vector< uint32_t * > sa_val
Definition: shader.hh:152

Generated on Fri Jun 9 2017 13:03:48 for gem5 by doxygen 1.8.6