#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA

#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "params/RubyGPUCoalescer.hh"
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(Request *req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) accessScope = HSAScope_WAVEFRONT;
        else if (req->isWorkgroupScope()) accessScope = HSAScope_WORKGROUP;
        else if (req->isDeviceScope()) accessScope = HSAScope_DEVICE;
        else if (req->isSystemScope()) accessScope = HSAScope_SYSTEM;
        else fatal("Bad scope type");
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(Request *req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;
    if (req->isGlobalSegment()) accessSegment = HSASegment_GLOBAL;
    else if (req->isGroupSegment()) accessSegment = HSASegment_GROUP;
    else if (req->isPrivateSegment()) accessSegment = HSASegment_PRIVATE;
    else if (req->isKernargSegment()) accessSegment = HSASegment_KERNARG;
    else if (req->isReadonlySegment()) accessSegment = HSASegment_READONLY;
    else if (req->isSpillSegment()) accessSegment = HSASegment_SPILL;
    else if (req->isArgSegment()) accessSegment = HSASegment_ARG;
    else fatal("Bad segment type");
    return accessSegment;
}
GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
int total_outstanding = 0;

for (; read != read_end; ++read) {
    panic("Possible Deadlock detected. Aborting!\n"
          "version: %d request.paddr: 0x%x m_readRequestTable: %d "
          "current time: %u issue_time: %d difference: %d\n",
          m_version,

for (; write != write_end; ++write) {
    panic("Possible Deadlock detected. Aborting!\n"
          "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
          "current time: %u issue_time: %d difference: %d\n",
          m_version,
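// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 implementation): the wakeup
// path above walks the read and write request tables and panics when any
// entry has been outstanding longer than a deadlock threshold. The names
// PendingReq, OutstandingTable, and checkForDeadlock are stand-ins.
// -------------------------------------------------------------------------
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <unordered_map>

struct PendingReq { uint64_t paddr; uint64_t issue_cycle; };
using OutstandingTable = std::unordered_map<uint64_t, PendingReq>;

void
checkForDeadlock(const OutstandingTable &table, uint64_t current_cycle,
                 uint64_t deadlock_threshold)
{
    for (const auto &entry : table) {
        const PendingReq &req = entry.second;
        assert(current_cycle >= req.issue_cycle);
        if (current_cycle - req.issue_cycle >= deadlock_threshold) {
            std::fprintf(stderr,
                "Possible deadlock: paddr 0x%llx outstanding for %llu cycles\n",
                (unsigned long long)req.paddr,
                (unsigned long long)(current_cycle - req.issue_cycle));
            std::abort();
        }
    }
}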
for (int i = 0; i < RubyRequestType_NUM; i++) {
    for (int j = 0; j < MachineType_NUM; j++) {

for (int i = 0; i < MachineType_NUM; i++) {
return RequestStatus_BufferFull;

    request_type != RubyRequestType_Locked_RMW_Write) {
    return RequestStatus_Aliased;

if ((request_type == RubyRequestType_ST) ||
    (request_type == RubyRequestType_ATOMIC) ||
    (request_type == RubyRequestType_ATOMIC_RETURN) ||
    (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
    (request_type == RubyRequestType_RMW_Read) ||
    (request_type == RubyRequestType_RMW_Write) ||
    (request_type == RubyRequestType_Load_Linked) ||
    (request_type == RubyRequestType_Store_Conditional) ||
    (request_type == RubyRequestType_Locked_RMW_Read) ||
    (request_type == RubyRequestType_Locked_RMW_Write) ||
    (request_type == RubyRequestType_FLUSH)) {

    return RequestStatus_Aliased;
    return RequestStatus_Aliased;
    return RequestStatus_Aliased;
    return RequestStatus_Aliased;

return RequestStatus_Ready;
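// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): getRequestStatus()-style
// logic that reports BufferFull when too many requests are outstanding,
// Aliased when another request to the same cache line is already pending, and
// Ready otherwise. The container and field names below are stand-ins.
// -------------------------------------------------------------------------
#include <cstdint>
#include <unordered_set>

enum class ReqStatus { Ready, Aliased, BufferFull };

struct CoalescerState {
    std::unordered_set<uint64_t> pendingReadLines;
    std::unordered_set<uint64_t> pendingWriteLines;
    int outstanding = 0;
    int maxOutstanding = 256;   // assumed limit, analogous to m_max_outstanding_requests
};

inline uint64_t lineAddress(uint64_t addr, uint64_t lineBytes = 64) {
    return addr & ~(lineBytes - 1);
}

ReqStatus getStatus(const CoalescerState &s, uint64_t addr, bool isStore)
{
    if (s.outstanding >= s.maxOutstanding)
        return ReqStatus::BufferFull;
    const uint64_t line = lineAddress(addr);
    // A store aliases with any pending access to the line; a load aliases
    // only with a pending write to the line.
    if (s.pendingWriteLines.count(line) ||
        (isStore && s.pendingReadLines.count(line)))
        return ReqStatus::Aliased;
    return ReqStatus::Ready;
}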
            kernelEndList.size());

if ((request_type == RubyRequestType_ST) ||
    (request_type == RubyRequestType_ATOMIC) ||
    (request_type == RubyRequestType_ATOMIC_RETURN) ||
    (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
    (request_type == RubyRequestType_RMW_Read) ||
    (request_type == RubyRequestType_RMW_Write) ||
    (request_type == RubyRequestType_Load_Linked) ||
    (request_type == RubyRequestType_Store_Conditional) ||
    (request_type == RubyRequestType_Locked_RMW_Read) ||
    (request_type == RubyRequestType_Locked_RMW_Write) ||
    (request_type == RubyRequestType_FLUSH)) {

    RequestTable::iterator i = r.first;
    DPRINTF(GPUCoalescer,
            "Inserting write request for paddr %#x for type %d\n",

    RequestTable::iterator i = r.first;
    DPRINTF(GPUCoalescer,
            "Inserting read request for paddr %#x for type %d\n",
if ((srequest->m_type == RubyRequestType_ST) ||
    (srequest->m_type == RubyRequestType_RMW_Read) ||
    (srequest->m_type == RubyRequestType_RMW_Write) ||
    (srequest->m_type == RubyRequestType_Load_Linked) ||
    (srequest->m_type == RubyRequestType_Store_Conditional) ||
    (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
    (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {

if (request->m_type == RubyRequestType_Store_Conditional) {
} else if (request->m_type == RubyRequestType_Load_Linked) {
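// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): how handleLlsc()-style
// logic might treat Load_Linked and Store_Conditional requests with a per-line
// monitor owned by a context. LockTable and the field names are stand-ins.
// -------------------------------------------------------------------------
#include <cstdint>
#include <unordered_map>

using LockTable = std::unordered_map<uint64_t, int>;  // line address -> context id

// Returns true when the access may proceed (always true except a failed SC).
bool handleLlscSketch(LockTable &locks, uint64_t lineAddr, int contextId,
                      bool isLoadLinked, bool isStoreConditional)
{
    if (isLoadLinked) {
        // LL establishes the monitor: remember which context holds the line.
        locks[lineAddr] = contextId;
        return true;
    }
    if (isStoreConditional) {
        // SC succeeds only if this context still holds the monitor; either
        // way the monitor is cleared afterwards.
        auto it = locks.find(lineAddr);
        bool success = (it != locks.end() && it->second == contextId);
        if (it != locks.end())
            locks.erase(it);
        return success;
    }
    return true;
}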
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
    initialRequestTime, forwardRequestTime, firstResponseTime,

                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,

assert((request->m_type == RubyRequestType_ST) ||
       (request->m_type == RubyRequestType_ATOMIC) ||
       (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
       (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
       (request->m_type == RubyRequestType_RMW_Read) ||
       (request->m_type == RubyRequestType_RMW_Write) ||
       (request->m_type == RubyRequestType_Load_Linked) ||
       (request->m_type == RubyRequestType_Store_Conditional) ||
       (request->m_type == RubyRequestType_Locked_RMW_Read) ||
       (request->m_type == RubyRequestType_Locked_RMW_Write) ||
       (request->m_type == RubyRequestType_FLUSH));

if (request->m_type == RubyRequestType_Locked_RMW_Read) {
} else if (request->m_type == RubyRequestType_Locked_RMW_Write) {

    request->issue_time, forwardRequestTime, firstResponseTime,

                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
    initialRequestTime, forwardRequestTime, firstResponseTime,

                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,

assert((request->m_type == RubyRequestType_LD) ||
       (request->m_type == RubyRequestType_IFETCH));

    request->issue_time, forwardRequestTime, firstResponseTime,

                         Cycles initialRequestTime,
                         Cycles forwardRequestTime,
if (type == RubyRequestType_IFETCH) {

for (int i = 0; i < len; ++i) {
    assert(type == reqCoalescer[request_line_address][i].primaryType);
    request_address = pkt->getAddr();
    if (pkt->getPtr<uint8_t>()) {
        if ((type == RubyRequestType_LD) ||
            (type == RubyRequestType_ATOMIC) ||
            (type == RubyRequestType_ATOMIC_RETURN) ||
            (type == RubyRequestType_IFETCH) ||
            (type == RubyRequestType_RMW_Read) ||
            (type == RubyRequestType_Locked_RMW_Read) ||
            (type == RubyRequestType_Load_Linked)) {
            memcpy(pkt->getPtr<uint8_t>(),
        DPRINTF(MemoryAccess,
                "WARNING.  Data not transfered from Ruby to M5 for type " \
                "%s\n",
                RubyRequestType_to_string(type));
    mylist.push_back(pkt);
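// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): after a line fill
// returns, each coalesced packet that reads data gets the relevant bytes of
// the line-sized data block copied back at its offset within the line, much
// like the loop above. CoalescedPkt and its fields are stand-ins.
// -------------------------------------------------------------------------
#include <cstdint>
#include <cstring>
#include <vector>

struct CoalescedPkt {
    uint64_t addr;       // full byte address of this access
    unsigned size;       // access size in bytes
    uint8_t *data;       // destination buffer (may be null for no-data packets)
    bool needsData;      // true for loads / atomics-with-return, false for plain stores
};

void copyLineDataToPackets(const uint8_t *lineData, uint64_t lineAddr,
                           std::vector<CoalescedPkt> &pkts)
{
    for (auto &pkt : pkts) {
        if (!pkt.data || !pkt.needsData)
            continue;                       // nothing to return for this access
        uint64_t offset = pkt.addr - lineAddr;
        std::memcpy(pkt.data, lineData + offset, pkt.size);
    }
}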
return RequestStatus_Issued;
return RequestStatus_Issued;
return RequestStatus_BufferFull;

RubyRequestType primary_type = RubyRequestType_NULL;
RubyRequestType secondary_type = RubyRequestType_NULL;

primary_type = RubyRequestType_Store_Conditional;
primary_type = RubyRequestType_Load_Linked;
secondary_type = RubyRequestType_ATOMIC;
primary_type = RubyRequestType_Locked_RMW_Write;
primary_type = RubyRequestType_Locked_RMW_Read;
secondary_type = RubyRequestType_ST;
primary_type = RubyRequestType_ATOMIC;
secondary_type = RubyRequestType_ATOMIC;
primary_type = secondary_type = RubyRequestType_IFETCH;

#if THE_ISA == X86_ISA
bool storeCheck = flags &
#else
bool storeCheck = false;
#endif // X86_ISA

primary_type = RubyRequestType_RMW_Read;
secondary_type = RubyRequestType_ST;
primary_type = secondary_type = RubyRequestType_LD;
primary_type = secondary_type = RubyRequestType_ST;
primary_type = secondary_type = RubyRequestType_FLUSH;

return RequestStatus_Issued;
return RequestStatus_Issued;
panic("Unsupported ruby packet type\n");

if (status != RequestStatus_Ready)

} else if (primary_type !=
    return RequestStatus_Aliased;
return RequestStatus_Aliased;
return RequestStatus_Aliased;

reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
return RequestStatus_Issued;
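// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): how a packet's
// attributes could be decoded into the "primary" type tracked by the
// coalescer and the "secondary" type sent to the cache controller, then
// recorded per cache line. PktInfo, decodeTypes, and the table alias are
// stand-ins; the real reqCoalescer stores richer per-request descriptors.
// -------------------------------------------------------------------------
#include <cstdint>
#include <map>
#include <vector>

enum class RubyType { LD, ST, IFETCH, ATOMIC, Load_Linked, Store_Conditional,
                      Locked_RMW_Read, Locked_RMW_Write, FLUSH };

struct PktInfo {
    bool isLLSC, isLocked, isAtomic, isInstFetch, isFlush, isWrite;
};

struct TypePair { RubyType primary, secondary; };

TypePair decodeTypes(const PktInfo &p)
{
    if (p.isLLSC)
        // LL/SC pairs are presented to the protocol as an atomic.
        return {p.isWrite ? RubyType::Store_Conditional : RubyType::Load_Linked,
                RubyType::ATOMIC};
    if (p.isLocked)
        // x86-style locked RMW looks like a store to the protocol.
        return {p.isWrite ? RubyType::Locked_RMW_Write : RubyType::Locked_RMW_Read,
                RubyType::ST};
    if (p.isAtomic)    return {RubyType::ATOMIC, RubyType::ATOMIC};
    if (p.isInstFetch) return {RubyType::IFETCH, RubyType::IFETCH};
    if (p.isFlush)     return {RubyType::FLUSH, RubyType::FLUSH};
    return p.isWrite ? TypePair{RubyType::ST, RubyType::ST}
                     : TypePair{RubyType::LD, RubyType::LD};
}

// Requests to the same cache line are queued together until the line is issued.
using SimpleCoalescingTable = std::map<uint64_t, std::vector<TypePair>>;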
for (int i = 0; i < tableSize; i++) {
    uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
    uint32_t tmpSize = tmpPkt->getSize();
        atomicOps.push_back(tmpAtomicOp);
    } else if (tmpPkt->isWrite()) {
        dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;

std::shared_ptr<RubyRequest> msg;
                          RubyAccessMode_Supervisor, pkt,
                          PrefetchBit_No, proc_id, 100,
                          blockSize, accessMask,
                          dataBlock, atomicOps,
                          accessScope, accessSegment);
                          RubyAccessMode_Supervisor, pkt,
                          PrefetchBit_No, proc_id, 100,
                          blockSize, accessMask,
                          accessScope, accessSegment);

DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
         RubyRequestType_to_string(secondary_type));

fatal_if(secondary_type == RubyRequestType_IFETCH,
         "there should not be any I-Fetch requests in the GPU Coalescer");

    "should not have a latency of zero");
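// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): when a coalesced line
// request is issued, a per-byte access mask and a line-sized data block are
// assembled from every packet queued for that line, as the loop above hints.
// QueuedPkt, LineRequest, and kLineBytes are stand-ins.
// -------------------------------------------------------------------------
#include <array>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr unsigned kLineBytes = 64;

struct QueuedPkt {
    uint64_t addr;
    unsigned size;
    const uint8_t *data;   // write data, or nullptr for reads
    bool isWrite;
};

struct LineRequest {
    std::array<bool, kLineBytes> accessMask{};   // which bytes are touched
    std::array<uint8_t, kLineBytes> dataBlock{}; // merged write data
};

LineRequest buildLineRequest(uint64_t lineAddr, const std::vector<QueuedPkt> &pkts)
{
    LineRequest req;
    for (const auto &pkt : pkts) {
        unsigned offset = static_cast<unsigned>(pkt.addr - lineAddr);
        if (pkt.isWrite && pkt.data)
            std::memcpy(req.dataBlock.data() + offset, pkt.data, pkt.size);
        for (unsigned j = 0; j < pkt.size; j++)
            req.accessMask[offset + j] = true;
    }
    return req;
}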
template <class KEY, class VALUE>
ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
#ifdef CHECK_COHERENCE

DPRINTF(RubyStats, "Recorded statistic: %s\n",
        SequencerRequestType_to_string(requestType));

    : Event(Progress_Event_Pri), seq(_seq)
for (int i = 0; i < len; ++i) {

panic("GPUCoalescer::makeRequest should never be called if the "
      "request is already outstanding\n");

for (int i = 0; i < len; i++) {

seq->completeIssue();

return "Issue coalesced request";
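// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): the deferred-issue
// pattern suggested by IssueEvent above. makeRequest() only queues packets
// per cache line; an event scheduled for the same cycle later calls
// completeIssue(), which walks the newly touched lines and issues one
// coalesced request per line. The member and callback names are stand-ins.
// -------------------------------------------------------------------------
#include <cstdint>
#include <functional>
#include <map>
#include <vector>

struct CoalescerSketch {
    std::map<uint64_t, std::vector<int>> reqCoalescer;  // line -> queued pkt ids
    std::vector<uint64_t> newRequests;                  // lines first touched this cycle

    void makeRequest(uint64_t lineAddr, int pktId) {
        if (reqCoalescer[lineAddr].empty())
            newRequests.push_back(lineAddr);
        reqCoalescer[lineAddr].push_back(pktId);
    }

    // Called once by the scheduled issue event.
    void completeIssue() {
        for (uint64_t line : newRequests)
            issueLine(line, reqCoalescer[line]);
        newRequests.clear();
    }

    // Hook standing in for the protocol-facing issueRequest() path.
    std::function<void(uint64_t, const std::vector<int> &)> issueLine =
        [](uint64_t, const std::vector<int> &) {};
};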
assert((srequest->m_type == RubyRequestType_ATOMIC) ||
       (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
       (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    srequest->issue_time, Cycles(0), Cycles(0), true, false);

for (int i = 0; i < len; ++i) {
    assert(srequest->m_type ==
    request_address = (pkt->getAddr());
    if (pkt->getPtr<uint8_t>() &&
        srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
        memcpy(pkt->getPtr<uint8_t>(),
        DPRINTF(MemoryAccess,
                "WARNING.  Data not transfered from Ruby to M5 for type " \
                "%s\n",
                RubyRequestType_to_string(srequest->m_type));
    mylist.push_back(pkt);

if (myMachID == senderMachID) {
if (myMachID == senderMachID) {

for (int i = 0; i < len; ++i) {
    assert(port != NULL);
    port->hitCallback(mylist[i]);

return request->pkt;
                                  Cycles initialRequestTime,
                                  Cycles forwardRequestTime,
                                  Cycles firstResponseTime,
                                  bool success, bool isRegion)

assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;

if (mach == MachineType_TCP) {
    if (type == RubyRequestType_LD) {
} else if (mach == MachineType_L1Cache_wCC) {
    if (type == RubyRequestType_LD) {
} else if (mach == MachineType_TCC) {
    if (type == RubyRequestType_LD) {

if (type == RubyRequestType_LD) {

if (total_lat != Cycles(0)) {

if (mach != MachineType_NUM) {

if ((issued_time <= initialRequestTime) &&
    (initialRequestTime <= forwardRequestTime) &&
    (forwardRequestTime <= firstResponseTime) &&
    (firstResponseTime <= completion_time)) {

        initialRequestTime - issued_time);
        forwardRequestTime - initialRequestTime);
        firstResponseTime - forwardRequestTime);
        completion_time - firstResponseTime);
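// -------------------------------------------------------------------------
// Illustrative sketch (assumption, not the gem5 code): recordMissLatency()-
// style bookkeeping that splits a miss latency into issue->initial,
// initial->forward, forward->firstResponse, and firstResponse->completion
// segments and samples each into its own histogram, mirroring the deltas
// above. The Histogram type here is a simple stand-in counter.
// -------------------------------------------------------------------------
#include <cstdint>
#include <map>

struct HistogramSketch {
    std::map<uint64_t, uint64_t> buckets;
    void sample(uint64_t v) { ++buckets[v]; }
};

struct MissLatencyStats {
    HistogramSketch total, issueToInitial, initialToForward,
                    forwardToFirstResp, firstRespToCompletion;
};

void recordMissLatencySketch(MissLatencyStats &stats,
                             uint64_t issued, uint64_t initial, uint64_t forward,
                             uint64_t firstResp, uint64_t completion)
{
    stats.total.sample(completion - issued);
    // Only record the breakdown when the timestamps are monotonically ordered;
    // otherwise a stage was skipped and the deltas would be meaningless.
    if (issued <= initial && initial <= forward &&
        forward <= firstResp && firstResp <= completion) {
        stats.issueToInitial.sample(initial - issued);
        stats.initialToForward.sample(forward - initial);
        stats.forwardToFirstResp.sample(firstResp - forward);
        stats.firstRespToCompletion.sample(completion - firstResp);
    }
}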
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
         success ? "Done" : "SC_Failed", "", "",
for (int i = 0; i < RubyRequestType_NUM; i++) {

for (int i = 0; i < MachineType_NUM; i++) {

for (int i = 0; i < RubyRequestType_NUM; i++) {
    for (int j = 0; j < MachineType_NUM; j++) {
    .desc("loads that hit in the TCP")
    .name(name() + ".gpu_tcp_ld_transfers")
    .desc("TCP to TCP load transfers")
    .desc("loads that hit in the TCC")
    .desc("loads that miss in the GPU")

    .desc("stores that hit in the TCP")
    .name(name() + ".gpu_tcp_st_transfers")
    .desc("TCP to TCP store transfers")
    .desc("stores that hit in the TCC")
    .desc("stores that miss in the GPU")

    .desc("loads that hit in the TCP")
    .name(name() + ".cp_tcp_ld_transfers")
    .desc("TCP to TCP load transfers")
    .desc("loads that hit in the TCC")
    .desc("loads that miss in the GPU")

    .desc("stores that hit in the TCP")
    .name(name() + ".cp_tcp_st_transfers")
    .desc("TCP to TCP store transfers")
    .desc("stores that hit in the TCC")
    .desc("stores that miss in the GPU")