Post

O3 Cpu Fetch

Fetch

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
 895 template <class Impl>
 896 void
 897 DefaultFetch<Impl>::tick()
 898 {
 899     list<ThreadID>::iterator threads = activeThreads->begin();
 900     list<ThreadID>::iterator end = activeThreads->end();
 901     bool status_change = false;
 902 
 903     wroteToTimeBuffer = false;
 904 
 905     for (ThreadID i = 0; i < numThreads; ++i) {
 906         issuePipelinedIfetch[i] = false;
 907     }
 908 
 909     while (threads != end) {
 910         ThreadID tid = *threads++;
 911 
 912         // Check the signals for each thread to determine the proper status
 913         // for each thread.
 914         bool updated_status = checkSignalsAndUpdate(tid);
 915         status_change =  status_change || updated_status;
 916     }
 917 
 918     DPRINTF(Fetch, "Running stage.\n");
 919 
 920     if (FullSystem) {
 921         if (fromCommit->commitInfo[0].interruptPending) {
 922             interruptPending = true;
 923         }
 924 
 925         if (fromCommit->commitInfo[0].clearInterrupt) {
 926             interruptPending = false;
 927         }
 928     }
 929 
 930     for (threadFetched = 0; threadFetched < numFetchingThreads;
 931          threadFetched++) {
 932         // Fetch each of the actively fetching threads.
 933         fetch(status_change);
 934     }
 935 
 936     // Record number of instructions fetched this cycle for distribution.
 937     fetchNisnDist.sample(numInst);
 938 
 939     if (status_change) {
 940         // Change the fetch stage status if there was a status change.
 941         _status = updateFetchStatus();
 942     }
 943 
 944     // Issue the next I-cache request if possible.
 945     for (ThreadID i = 0; i < numThreads; ++i) {
 946         if (issuePipelinedIfetch[i]) {
 947             pipelineIcacheAccesses(i);
 948         }
 949     }
 950 
 951     // Send instructions enqueued into the fetch queue to decode.
 952     // Limit rate by fetchWidth.  Stall if decode is stalled.
 953     unsigned insts_to_decode = 0;
 954     unsigned available_insts = 0;
 955 
 956     for (auto tid : *activeThreads) {
 957         if (!stalls[tid].decode) {
 958             available_insts += fetchQueue[tid].size();
 959         }
 960     }
 961 
 962     // Pick a random thread to start trying to grab instructions from
 963     auto tid_itr = activeThreads->begin();
 964     std::advance(tid_itr, random_mt.random<uint8_t>(0, activeThreads->size() - 1));
 965 
 966     while (available_insts != 0 && insts_to_decode < decodeWidth) {
 967         ThreadID tid = *tid_itr;
 968         if (!stalls[tid].decode && !fetchQueue[tid].empty()) {
 969             const auto& inst = fetchQueue[tid].front();
 970             toDecode->insts[toDecode->size++] = inst;
 971             DPRINTF(Fetch, "[tid:%i] [sn:%llu] Sending instruction to decode "
 972                     "from fetch queue. Fetch queue size: %i.\n",
 973                     tid, inst->seqNum, fetchQueue[tid].size());
 974 
 975             wroteToTimeBuffer = true;
 976             fetchQueue[tid].pop_front();
 977             insts_to_decode++;
 978             available_insts--;
 979         }
 980 
 981         tid_itr++;
 982         // Wrap around if at end of active threads list
 983         if (tid_itr == activeThreads->end())
 984             tid_itr = activeThreads->begin();
 985     }
 986 
 987     // If there was activity this cycle, inform the CPU of it.
 988     if (wroteToTimeBuffer) {
 989         DPRINTF(Activity, "Activity this cycle.\n");
 990         cpu->activityThisCycle();
 991     }
 992 
 993     // Reset the number of the instruction we've fetched.
 994     numInst = 0;
 995 }

fetch: resolving TLB and cache accesses to actually fetch instructions

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
1157 void
1158 DefaultFetch<Impl>::fetch(bool &status_change)
1159 {
1160     //////////////////////////////////////////
1161     // Start actual fetch
1162     //////////////////////////////////////////
1163     ThreadID tid = getFetchingThread();
1164 
1165     assert(!cpu->switchedOut());
1166 
1167     if (tid == InvalidThreadID) {
1168         // Breaks looping condition in tick()
1169         threadFetched = numFetchingThreads;
1170 
1171         if (numThreads == 1) {  // @todo Per-thread stats
1172             profileStall(0);
1173         }
1174 
1175         return;
1176     }
1177 
1178     DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid);
1179 
1180     // The current PC.
1181     TheISA::PCState thisPC = pc[tid];
1182 
1183     Addr pcOffset = fetchOffset[tid];
1184     Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1185 
1186     bool inRom = isRomMicroPC(thisPC.microPC());
1187 
1188     // If returning from the delay of a cache miss, then update the status
1189     // to running, otherwise do the cache access.  Possibly move this up
1190     // to tick() function.
1191     if (fetchStatus[tid] == IcacheAccessComplete) {
1192         DPRINTF(Fetch, "[tid:%i] Icache miss is complete.\n", tid);
1193 
1194         fetchStatus[tid] = Running;
1195         status_change = true;
1196     } else if (fetchStatus[tid] == Running) {
1197         // Align the fetch PC so its at the start of a fetch buffer segment.
1198         Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1199 
1200         // If buffer is no longer valid or fetchAddr has moved to point
1201         // to the next cache block, AND we have no remaining ucode
1202         // from a macro-op, then start fetch from icache.
1203         if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid])
1204             && !inRom && !macroop[tid]) {
1205             DPRINTF(Fetch, "[tid:%i] Attempting to translate and read "
1206                     "instruction, starting at PC %s.\n", tid, thisPC);
1207 
1208             fetchCacheLine(fetchAddr, tid, thisPC.instAddr());
1209 
1210             if (fetchStatus[tid] == IcacheWaitResponse)
1211                 ++icacheStallCycles;
1212             else if (fetchStatus[tid] == ItlbWait)
1213                 ++fetchTlbCycles;
1214             else
1215                 ++fetchMiscStallCycles;
1216             return;
1217         } else if ((checkInterrupt(thisPC.instAddr()) && !delayedCommit[tid])) {
1218             // Stall CPU if an interrupt is posted and we're not issuing
1219             // an delayed commit micro-op currently (delayed commit instructions
1220             // are not interruptable by interrupts, only faults)
1221             ++fetchMiscStallCycles;
1222             DPRINTF(Fetch, "[tid:%i] Fetch is stalled!\n", tid);
1223             return;
1224         }
1225     } else {
1226         if (fetchStatus[tid] == Idle) {
1227             ++fetchIdleCycles;
1228             DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid);
1229         }
1230 
1231         // Status is Idle, so fetch should do nothing.
1232         return;
1233     }
......
1417 }

The fetch function is a pretty complex and long function to analyze at once. Therefore, we will divide it into two main parts to understand the entire logic of the O3CPU’s fetch stage. The first part explains how the fetch stage generates requests to the ITLB and ICache to resolve the virtual-to-physical address translation and then access the cache using the translated address. After the fetch stage receives the instructions from the ICache, the remaining part prepares the data structure that will be passed to the next stage, decode. Let’s take a look at how the fetch function retrieves the instructions first.

First part of the fetch: ITLB to ICache access.

getFetchingThread: selecting thread to let it fetch

If multiple threads need to fetch their next instructions, the processor should select one among them to continue fetching. Based on the policy adopted by the processor, getFetchingThread can return a different thread depending on the current status of the threads.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
1445 ///////////////////////////////////////
1446 //                                   //
1447 //  SMT FETCH POLICY MAINTAINED HERE //
1448 //                                   //
1449 ///////////////////////////////////////
1450 template<class Impl>
1451 ThreadID
1452 DefaultFetch<Impl>::getFetchingThread()
1453 {
1454     if (numThreads > 1) {
1455         switch (fetchPolicy) {
1456           case FetchPolicy::RoundRobin:
1457             return roundRobin();
1458           case FetchPolicy::IQCount:
1459             return iqCount();
1460           case FetchPolicy::LSQCount:
1461             return lsqCount();
1462           case FetchPolicy::Branch:
1463             return branchCount();
1464           default:
1465             return InvalidThreadID;
1466         }
1467     } else {
1468         list<ThreadID>::iterator thread = activeThreads->begin();
1469         if (thread == activeThreads->end()) {
1470             return InvalidThreadID;
1471         }
1472 
1473         ThreadID tid = *thread;
1474 
1475         if (fetchStatus[tid] == Running ||
1476             fetchStatus[tid] == IcacheAccessComplete ||
1477             fetchStatus[tid] == Idle) {
1478             return tid;
1479         } else {
1480             return InvalidThreadID;
1481         }
1482     }
1483 }

Translating virtual to physical address using I-TLB

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
 602 template <class Impl>
 603 bool
 604 DefaultFetch<Impl>::fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc)
 605 {   
 606     Fault fault = NoFault;
 607     
 608     assert(!cpu->switchedOut());
 609     
 610     // @todo: not sure if these should block translation.
 611     //AlphaDep
 612     if (cacheBlocked) {
 613         DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, cache blocked\n",
 614                 tid);
 615         return false;
 616     } else if (checkInterrupt(pc) && !delayedCommit[tid]) {
 617         // Hold off fetch from getting new instructions when:
 618         // Cache is blocked, or
 619         // while an interrupt is pending and we're not in PAL mode, or
 620         // fetch is switched out.
 621         DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, interrupt pending\n",
 622                 tid);
 623         return false;
 624     }
 625     
 626     // Align the fetch address to the start of a fetch buffer segment.
 627     Addr fetchBufferBlockPC = fetchBufferAlignPC(vaddr);
 628     
 629     DPRINTF(Fetch, "[tid:%i] Fetching cache line %#x for addr %#x\n",
 630             tid, fetchBufferBlockPC, vaddr);
 631     
 632     // Setup the memReq to do a read of the first instruction's address.
 633     // Set the appropriate read size and flags as well.
 634     // Build request here.
 635     RequestPtr mem_req = std::make_shared<Request>(
 636         tid, fetchBufferBlockPC, fetchBufferSize, 
 637         Request::INST_FETCH, cpu->instMasterId(), pc,
 638         cpu->thread[tid]->contextId());
 639     
 640     mem_req->taskId(cpu->taskId());
 641     
 642     memReq[tid] = mem_req;
 643     
 644     // Initiate translation of the icache block
 645     fetchStatus[tid] = ItlbWait;
 646     FetchTranslation *trans = new FetchTranslation(this);
 647     cpu->itb->translateTiming(mem_req, cpu->thread[tid]->getTC(),
 648                               trans, BaseTLB::Execute);
 649     return true;
 650 }

One can ask how the fetch stage knows when the translation is finished. Note that a FetchTranslation object is instantiated and sent to the Instruction TLB (itb); it carries the function that should be invoked after the translation is resolved. Therefore, when the instruction TLB finishes the translation, it invokes the function provided by the passed FetchTranslation object, letting the fetch stage process the next step: initiating the cache access. Anyway, let’s take a look at which function is provided to the TLB.

gem5/src/cpu/o3/fetch.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
115     class FetchTranslation : public BaseTLB::Translation
116     {
117       protected:
118         DefaultFetch<Impl> *fetch;
119 
120       public:
121         FetchTranslation(DefaultFetch<Impl> *_fetch)
122             : fetch(_fetch)
123         {}
124 
125         void
126         markDelayed()
127         {}
128 
129         void
130         finish(const Fault &fault, const RequestPtr &req, ThreadContext *tc,
131                BaseTLB::Mode mode)
132         {
133             assert(mode == BaseTLB::Execute);
134             fetch->finishTranslation(fault, req);
135             delete this;
136         }
137     };

You might remember that the TLB invokes the finish function at the end of the translation. Yes, the FetchTranslation object provides the finish function. When the TLB finishes the translation, it lets the processor know the translation is resolved by invoking the finish function. The finish function further invokes the finishTranslation function defined in the DefaultFetch class.

finishTranslation: finishing TLB access and generate cache access

After the request to the TLB has been resolved, the remaining job is accessing the cache to read the instructions to fetch. Let’s take a look at how the fetch stage of the O3 CPU accesses the instruction cache.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
 652 template <class Impl>
 653 void
 654 DefaultFetch<Impl>::finishTranslation(const Fault &fault,
 655                                       const RequestPtr &mem_req)
 656 {
 657     ThreadID tid = cpu->contextToThread(mem_req->contextId());
 658     Addr fetchBufferBlockPC = mem_req->getVaddr();
 659 
 660     assert(!cpu->switchedOut());
 661 
 662     // Wake up CPU if it was idle
 663     cpu->wakeCPU();
 664 
 665     if (fetchStatus[tid] != ItlbWait || mem_req != memReq[tid] ||
 666         mem_req->getVaddr() != memReq[tid]->getVaddr()) {
 667         DPRINTF(Fetch, "[tid:%i] Ignoring itlb completed after squash... fetchStatus:%d\n",
 668                 tid,fetchStatus[tid]);
 669         ++fetchTlbSquashes;
 670         return;
 671     }

Compared to a simple processor that doesn’t provide speculative execution, the O3 processor utilizes branch prediction and out-of-order execution. Therefore, if the current TLB completion is delivered to the O3CPU on behalf of a misspeculated fetch, it should drop the TLB response and stop accessing the cache. Note that the speculation can turn out to be false while fetch is waiting for the TLB response. Lines 665-670 check for this misspeculation.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
 674     // If translation was successful, attempt to read the icache block.
 675     if (fault == NoFault) {
 676         // Check that we're not going off into random memory
 677         // If we have, just wait around for commit to squash something and put
 678         // us on the right track
 679         if (!cpu->system->isMemAddr(mem_req->getPaddr())) {
 680             warn("Address %#x is outside of physical memory, stopping fetch\n",
 681                     mem_req->getPaddr());
 682             fetchStatus[tid] = NoGoodAddr;
 683             memReq[tid] = NULL;
 684             return;
 685         }
 686 
 687         // Build packet here to access the Icache.
 688         PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq);
 689         data_pkt->dataDynamic(new uint8_t[fetchBufferSize]);
 690 
 691         fetchBufferPC[tid] = fetchBufferBlockPC;
 692         fetchBufferValid[tid] = false;
 693         DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
 694 
 695         fetchedCacheLines++;
 696
 697         // Access the cache.
 698         if (!icachePort.sendTimingReq(data_pkt)) {
 699             assert(retryPkt == NULL);
 700             assert(retryTid == InvalidThreadID);
 701             DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
 702 
 703             fetchStatus[tid] = IcacheWaitRetry;
 704             retryPkt = data_pkt;
 705             retryTid = tid;
 706             cacheBlocked = true;
 707         } else {
 708             DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid);
 709             DPRINTF(Activity, "[tid:%i] Activity: Waiting on I-cache "
 710                     "response.\n", tid);
 711             lastIcacheStall[tid] = curTick();
 712             fetchStatus[tid] = IcacheWaitResponse;
 713             // Notify Fetch Request probe when a packet containing a fetch
 714             // request is successfully sent
 715             ppFetchRequestSent->notify(mem_req);
 716         }
 717     } else {

If the current TLB response is valid and the speculation was correct, fetch should build a read request packet and send it to the Instruction Cache. Lines 687-695 build the packet and allocate the buffer that will hold the instructions read from the cache. When the cache access request cannot be sent to the instruction cache (lines 698-707) because the cache is busy handling previous requests, it should be retried when the instruction cache becomes available later. Based on line 701, we can guess that the cache supports multiple simultaneous cache accesses, but the requests can still exceed the capacity of its simultaneous processing. We will see whether GEM5 supports blocking or non-blocking cache accesses in another posting. Anyway, when a retry is required, fetch memorizes the request packet and the tid, and it changes the current status to IcacheWaitRetry. When the instruction cache is able to accept the request (lines 708-716), fetch sets the current status to IcacheWaitResponse and waits until the instruction cache resolves the request and sends back the actual instructions.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
 717     } else {
 718         // Don't send an instruction to decode if we can't handle it.
 719         if (!(numInst < fetchWidth) || !(fetchQueue[tid].size() < fetchQueueSize)) {
 720             assert(!finishTranslationEvent.scheduled());
 721             finishTranslationEvent.setFault(fault);
 722             finishTranslationEvent.setReq(mem_req);
 723             cpu->schedule(finishTranslationEvent,
 724                           cpu->clockEdge(Cycles(1)));
 725             return;
 726         }
 727         DPRINTF(Fetch, "[tid:%i] Got back req with addr %#x but expected %#x\n",
 728                 tid, mem_req->getVaddr(), memReq[tid]->getVaddr());
 729         // Translation faulted, icache request won't be sent.
 730         memReq[tid] = NULL;
 731 
 732         // Send the fault to commit.  This thread will not do anything
 733         // until commit handles the fault.  The only other way it can
 734         // wake up is if a squash comes along and changes the PC.
 735         TheISA::PCState fetchPC = pc[tid];
 736 
 737         DPRINTF(Fetch, "[tid:%i] Translation faulted, building noop.\n", tid);
 738         // We will use a nop in ordier to carry the fault.
 739         DynInstPtr instruction = buildInst(tid, StaticInst::nopStaticInstPtr,
 740                                            NULL, fetchPC, fetchPC, false);
 741         instruction->setNotAnInst();
 742 
 743         instruction->setPredTarg(fetchPC);
 744         instruction->fault = fault;
 745         wroteToTimeBuffer = true;
 746 
 747         DPRINTF(Activity, "Activity this cycle.\n");
 748         cpu->activityThisCycle();
 749 
 750         fetchStatus[tid] = TrapPending;
 751 
 752         DPRINTF(Fetch, "[tid:%i] Blocked, need to handle the trap.\n", tid);
 753         DPRINTF(Fetch, "[tid:%i] fault (%s) detected @ PC %s.\n",
 754                 tid, fault->name(), pc[tid]);
 755     }
 756     _status = updateFetchStatus();
 757 }

When the TLB translation produces a fault instead of a successful translation, it should be handled based on the reason for the fault. When the fetch width has already been consumed this cycle or the fetchQueue is already full (lines 719-726), instead of handling the fault immediately, fetch postpones the operation by scheduling the finishTranslationEvent. Note that the request received from the ITLB and the fault structure are also stored in the finishTranslationEvent so that they can be processed later.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
140     /* Event to delay delivery of a fetch translation result in case of
141      * a fault and the nop to carry the fault cannot be generated
142      * immediately */
143     class FinishTranslationEvent : public Event
144     { 
145       private:
146         DefaultFetch<Impl> *fetch;
147         Fault fault;
148         RequestPtr req;
149       
150       public:
151         FinishTranslationEvent(DefaultFetch<Impl> *_fetch)
152             : fetch(_fetch), req(nullptr)
153         {}
154         
155         void setFault(Fault _fault)
156         {   
157             fault = _fault;
158         }
159         
160         void setReq(const RequestPtr &_req)
161         {   
162             req = _req;
163         }
164         
165         /** Process the delayed finish translation */
166         void process()
167         {   
168             assert(fetch->numInst < fetch->fetchWidth);
169             fetch->finishTranslation(fault, req);
170         }
171         
172         const char *description() const
173         {   
174             return "FullO3CPU FetchFinishTranslation";
175         }   
176       };

In detail, when the FinishTranslationEvent happens after the designated cycles passed, it invokes the process function defined in the class. As shown in the above code line 166-170, it calls finishTranslation with the passed fault and request again.

When the fault can be handled immediately (lines 727-754), fetch builds a nop instruction that carries the fault (lines 737-744), marks the thread’s activity so decode will see it, and sets the thread’s status to TrapPending; the thread will then make no progress until commit handles the fault or a squash changes the PC. After the fetch stage handles the response from the ITLB, it should update the current status of the fetch stage by invoking the updateFetchStatus function.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
 841 template<class Impl>
 842 typename DefaultFetch<Impl>::FetchStatus
 843 DefaultFetch<Impl>::updateFetchStatus()
 844 {
 845     //Check Running
 846     list<ThreadID>::iterator threads = activeThreads->begin();
 847     list<ThreadID>::iterator end = activeThreads->end();
 848 
 849     while (threads != end) {
 850         ThreadID tid = *threads++;
 851 
 852         if (fetchStatus[tid] == Running ||
 853             fetchStatus[tid] == Squashing ||
 854             fetchStatus[tid] == IcacheAccessComplete) {
 855 
 856             if (_status == Inactive) {
 857                 DPRINTF(Activity, "[tid:%i] Activating stage.\n",tid);
 858 
 859                 if (fetchStatus[tid] == IcacheAccessComplete) {
 860                     DPRINTF(Activity, "[tid:%i] Activating fetch due to cache"
 861                             "completion\n",tid);
 862                 }
 863 
 864                 cpu->activateStage(O3CPU::FetchIdx);
 865             }
 866 
 867             return Active;
 868         }
 869     }
 870 
 871     // Stage is switching from active to inactive, notify CPU of it.
 872     if (_status == Active) {
 873         DPRINTF(Activity, "Deactivating stage.\n");
 874 
 875         cpu->deactivateStage(O3CPU::FetchIdx);
 876     }
 877 
 878     return Inactive;
 879 }

processCacheCompletion: completing ICache access

When sendTimingReq is successfully invoked through the icachePort — meaning the cache access request was delivered to the Instruction cache — the O3CPU will be notified, after a few cycles have elapsed, that the cache read is complete. The cache access completion is handled by recvTimingResp of the IcachePort allocated for the O3CPU.

1
2
3
4
5
6
7
8
9
10
11
12
1676 template<class Impl>
1677 bool
1678 DefaultFetch<Impl>::IcachePort::recvTimingResp(PacketPtr pkt)
1679 {
1680     DPRINTF(O3CPU, "Fetch unit received timing\n");
1681     // We shouldn't ever get a cacheable block in Modified state
1682     assert(pkt->req->isUncacheable() ||
1683            !(pkt->cacheResponding() && !pkt->hasSharers()));
1684     fetch->processCacheCompletion(pkt);
1685 
1686     return true;
1687 }

When it receives the instructions from the cache, it invokes the processCacheCompletion function and asks this function to handle the response that arrived from the cache.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
 389 DefaultFetch<Impl>::processCacheCompletion(PacketPtr pkt)
 390 {
 391     ThreadID tid = cpu->contextToThread(pkt->req->contextId());
 392 
 393     DPRINTF(Fetch, "[tid:%i] Waking up from cache miss.\n", tid);
 394     assert(!cpu->switchedOut());
 395 
 396     // Only change the status if it's still waiting on the icache access
 397     // to return.
 398     if (fetchStatus[tid] != IcacheWaitResponse ||
 399         pkt->req != memReq[tid]) {
 400         ++fetchIcacheSquashes;
 401         delete pkt;
 402         return;
 403     }
 404 
 405     memcpy(fetchBuffer[tid], pkt->getConstPtr<uint8_t>(), fetchBufferSize);
 406     fetchBufferValid[tid] = true;
 407 
 408     // Wake up the CPU (if it went to sleep and was waiting on
 409     // this completion event).
 410     cpu->wakeCPU();
 411 
 412     DPRINTF(Activity, "[tid:%i] Activating fetch due to cache completion\n",
 413             tid);
 414 
 415     switchToActive();
 416 
 417     // Only switch to IcacheAccessComplete if we're not stalled as well.
 418     if (checkStall(tid)) {
 419         fetchStatus[tid] = Blocked;
 420     } else {
 421         fetchStatus[tid] = IcacheAccessComplete;
 422     }
 423 
 424     pkt->req->setAccessLatency();
 425     cpu->ppInstAccessComplete->notify(pkt);
 426     // Reset the mem req to NULL.
 427     delete pkt;
 428     memReq[tid] = NULL;
 429 }

When the instructions arrive from the cache, it could be the case that a misspeculation initiated the cache access. In that case, fetch should drop the cache access by deleting the response packet. Otherwise, the read instructions are copied from the packet into the fetchBuffer that holds the fetched instructions (lines 405-406). When the current tid is stalled because of some event (we will cover which conditions make a thread stall), it should be blocked until the stall is resolved. If there is no stall, then the fetchStatus can be changed to IcacheAccessComplete, which means the thread can finish the fetch stage. Now let’s go back to the fetch function again!

Revisiting fetch stage to handle the instructions fetched from the cache

Fetch tick happens every processor tick

One important thing to note is that the fetch stage is executed at every clock cycle. However, depending on the current status of the processor and of other components such as the TLB and cache, the fetch stage may not be able to produce meaningful progress and must wait until those components finish their operations. Even though modern processors have multiple hardware threads to run, if all of them are waiting on cache accesses, none of them can make progress in the fetch stage. The getFetchingThread function checks the status of all hardware threads and returns a thread if there is one that can execute the fetch stage.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
1156 template<class Impl>
1157 void
1158 DefaultFetch<Impl>::fetch(bool &status_change)
1159 {
1160     //////////////////////////////////////////
1161     // Start actual fetch
1162     //////////////////////////////////////////
1163     ThreadID tid = getFetchingThread();
1164 
1165     assert(!cpu->switchedOut());
1166 
1167     if (tid == InvalidThreadID) {
1168         // Breaks looping condition in tick()
1169         threadFetched = numFetchingThreads;
1170 
1171         if (numThreads == 1) {  // @todo Per-thread stats
1172             profileStall(0);
1173         }
1174 
1175         return;
1176     }

As shown in the above code, when there is no hardware thread available to execute the fetch stage, getFetchingThread returns InvalidThreadID, and no thread can produce progress in that clock cycle. getFetchingThread returns an available thread only when that thread is in one of three fetchStatus states: Running, IcacheAccessComplete, or Idle.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
   1000: system.cpu.fetch: Running stage.
   1000: system.cpu.fetch: Attempting to fetch from [tid:0]
   1000: system.cpu.fetch: [tid:0] Attempting to translate and read instruction, starting at PC (0x7ffff8000090=>0x7ffff8000098).(0=>1).
   1000: system.cpu.fetch: [tid:0] Fetching cache line 0x7ffff8000080 for addr 0x7ffff8000090
   1000: system.cpu.fetch: Fetch: Doing instruction read.
   1000: system.cpu.fetch: [tid:0] Doing Icache access.
   1500: system.cpu.fetch: Running stage.
   1500: system.cpu.fetch: There are no more threads available to fetch from.
   1500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   2000: system.cpu.fetch: Running stage.
   2000: system.cpu.fetch: There are no more threads available to fetch from.
   2000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   2500: system.cpu.fetch: Running stage.
   2500: system.cpu.fetch: There are no more threads available to fetch from.
   2500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   3000: system.cpu.fetch: Running stage.
   3000: system.cpu.fetch: There are no more threads available to fetch from.
   3000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   3500: system.cpu.fetch: Running stage.
   3500: system.cpu.fetch: There are no more threads available to fetch from.
   3500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   4000: system.cpu.fetch: Running stage.
   4000: system.cpu.fetch: There are no more threads available to fetch from.
   4000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   4500: system.cpu.fetch: Running stage.
   4500: system.cpu.fetch: There are no more threads available to fetch from.
   4500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   5000: system.cpu.fetch: Running stage.
   5000: system.cpu.fetch: There are no more threads available to fetch from.
   5000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
   5500: system.cpu.fetch: Running stage.
   5500: system.cpu.fetch: There are no more threads available to fetch from.
   5500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
  78000: system.cpu.fetch: [tid:0] Waking up from cache miss.
  78001: system.cpu.fetch: [tid:0] Waking up from cache miss.
  78500: system.cpu.fetch: Running stage.
  78500: system.cpu.fetch: Attempting to fetch from [tid:0]
  78500: system.cpu.fetch: [tid:0] Icache miss is complete.

In our current system, because we only have one hardware thread, it cannot execute the fetch stage to produce further progress while it waits for the ICache miss to be resolved. This behavior of the fetch stage is shown in the above log. After the thread first fetches instructions at tick 1000, it cannot produce any progress until the ICache miss is resolved at tick 78000. After the ICache miss is resolved (from tick 78500 on), it can finally produce progress in the fetch stage. Remember that when an ICache miss is resolved by the processCacheCompletion function, it changes the fetchStatus of the thread from IcacheWaitResponse to IcacheAccessComplete. Therefore, when the fetch stage is executed once again, the previously unexplored path will be taken.

1
2
3
4
5
6
7
8
9
1188     // If returning from the delay of a cache miss, then update the status
1189     // to running, otherwise do the cache access.  Possibly move this up
1190     // to tick() function.
1191     if (fetchStatus[tid] == IcacheAccessComplete) {
1192         DPRINTF(Fetch, "[tid:%i] Icache miss is complete.\n", tid);
1193 
1194         fetchStatus[tid] = Running;
1195         status_change = true;
1196     } else if (fetchStatus[tid] == Running) {

Compared to the initial fetch execution that initiated the ITLB and ICache accesses, because the fetchStatus has been changed to IcacheAccessComplete, the fetch stage can execute the rest of the fetch function at this moment. Let’s take a look at the rest of the fetch function in detail.

fetchBuffer contains actual instructions for a particular hardware thread

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
1235     //when a requested instruction cache block is arrived(IcacheAccessComplete)
1236     ++fetchCycles;
1237 
1238     TheISA::PCState nextPC = thisPC;
1239 
1240     StaticInstPtr staticInst = NULL;
1241     StaticInstPtr curMacroop = macroop[tid];
1242 
1243     // If the read of the first instruction was successful, then grab the
1244     // instructions from the rest of the cache line and put them into the
1245     // queue heading to decode.
1246 
1247     DPRINTF(Fetch, "[tid:%i] Adding instructions to queue to "
1248             "decode.\n", tid);
1249 
1250     // Need to keep track of whether or not a predicted branch
1251     // ended this fetch block.
1252     bool predictedBranch = false;
1253 
1254     // Need to halt fetch if quiesce instruction detected
1255     bool quiesce = false;
1256 
1257     TheISA::MachInst *cacheInsts =
1258         reinterpret_cast<TheISA::MachInst *>(fetchBuffer[tid]);
1259
1260     const unsigned numInsts = fetchBufferSize / instSize;
1261     unsigned blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;

Remember that fetchBuffer[tid] contains the actual instruction bytes read from the ICache. Note that the cacheInsts variable, which has type TheISA::MachInst *, references the instruction buffer fetchBuffer[tid]. This variable is passed to the decoder to feed it the instruction stream read from the ICache. Also, TheISA::MachInst is a uint64_t in the x86 architecture (TheISA will later be changed to the X86 namespace). Because the x86 architecture adopts a variable instruction length, the code approximates the instruction length as 8 bytes and uses it to calculate the number of instruction slots in the stream fetched from the ICache. Note that numInsts is approximated as fetchBufferSize / instSize.

The main fetchloop processing instructions

1
2
3
4
5
6
7
8
9
10
1263     // Loop through instruction memory from the cache.
1264     // Keep issuing while fetchWidth is available and branch is not
1265     // predicted taken
1266     while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize
1267            && !predictedBranch && !quiesce) {
......
1382         // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383         // or not.
1384         inRom = isRomMicroPC(thisPC.microPC());
1385     }

The while loop (lines 1266-1267) is the main body that processes the instructions stored in the fetchBuffer. Be careful not to confuse numInst with numInsts: numInst is the number of instructions fetched in this cycle, while numInsts is the number of instructions that can possibly reside in the fetchBuffer. Also, fetchQueue is a C++ standard deque managing DynInstPtr, which is a pointer to one macroop instruction. Therefore, the loop first checks whether the number of instructions fetched in this cycle exceeds the designated fetchWidth, and then examines whether the fetchQueue would overflow, which would mean too many instructions have been fetched from the instruction cache. Because the instruction length can vary but the capacity of the fetchQueue is limited, depending on which instructions actually reside in the fetched cache block, it sometimes cannot process all instructions in that cycle. Based on the fact that it checks for fetchQueue overflow at every iteration, we can assume that the loop inserts instructions into the fetchQueue. We will take a look at the details soon! The loop also checks the type of the previous instruction it handled, namely whether it was a predicted branch or a quiesce instruction. If the previous instruction turns out to be one of these types, the loop must not process further instructions in the fetchBuffer and stops.

Decoder

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
1268         // We need to process more memory if we aren't going to get a
1269         // StaticInst from the rom, the current macroop, or what's already
1270         // in the decoder.
1271         bool needMem = !inRom && !curMacroop &&
1272             !decoder[tid]->instReady();
1273         fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1274         Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1275 
1276         if (needMem) {
1277             // If buffer is no longer valid or fetchAddr has moved to point
1278             // to the next cache block then start fetch from icache.
1279             if (!fetchBufferValid[tid] ||
1280                 fetchBufferBlockPC != fetchBufferPC[tid])
1281                 break;
1282 
1283             if (blkOffset >= numInsts) {
1284                 // We need to process more memory, but we've run out of the
1285                 // current block.
1286                 break;
1287             }
1288 
1289             decoder[tid]->moreBytes(thisPC, fetchAddr, cacheInsts[blkOffset]);
1290 
1291             if (decoder[tid]->needMoreBytes()) {
1292                 blkOffset++;
1293                 fetchAddr += instSize;
1294                 pcOffset += instSize;
1295             }
1296         }

After all the conditions are met, each iteration of the loop processes one instruction at a time. For the first execution of the fetch stage, inRom and curMacroop are set to false and NULL, respectively. Also, when the decoder object embedded in the fetch stage is initialized, its instDone variable is set to false, which is what the decoder's instReady function returns. Therefore, needMem is set for the initial execution. When the needMem flag is set — meaning the decoder needs more raw bytes from the fetch buffer before it can produce a complete instruction — it invokes the moreBytes function of the decoder to feed it those bytes and decode the instruction.

1
2
3
4
5
6
7
8
9
10
11
306     //Use this to give data to the decoder. This should be used
307     //when there is control flow.
308     void moreBytes(const PCState &pc, Addr fetchPC, MachInst data)
309     {
310         DPRINTF(Decoder, "Getting more bytes.\n");
311         basePC = fetchPC;
312         offset = (fetchPC >= pc.instAddr()) ? 0 : pc.instAddr() - fetchPC;
313         fetchChunk = letoh(data);
314         outOfBytes = false;
315         process();
316     }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
 74 Decoder::process()
 75 {
 76     //This function drives the decoder state machine.
 77 
 78     //Some sanity checks. You shouldn't try to process more bytes if
 79     //there aren't any, and you shouldn't overwrite an already
 80     //decoder ExtMachInst.
 81     assert(!outOfBytes);
 82     assert(!instDone);
 83 
 84     if (state == ResetState)
 85         state = doResetState();
 86     if (state == FromCacheState) {
 87         state = doFromCacheState();
 88     } else {
 89         instBytes->chunks.push_back(fetchChunk);
 90     }
 91 
 92     //While there's still something to do...
 93     while (!instDone && !outOfBytes) {
 94         uint8_t nextByte = getNextByte();
 95         switch (state) {
 96           case PrefixState:
 97             state = doPrefixState(nextByte);
 98             break;
 99           case Vex2Of2State:
100             state = doVex2Of2State(nextByte);
101             break;
102           case Vex2Of3State:
103             state = doVex2Of3State(nextByte);
104             break;
105           case Vex3Of3State:
106             state = doVex3Of3State(nextByte);
107             break;
108           case VexOpcodeState:
109             state = doVexOpcodeState(nextByte);
110             break;
111           case OneByteOpcodeState:
112             state = doOneByteOpcodeState(nextByte);
113             break;
114           case TwoByteOpcodeState:
115             state = doTwoByteOpcodeState(nextByte);
116             break;
117           case ThreeByte0F38OpcodeState:
118             state = doThreeByte0F38OpcodeState(nextByte);
119             break;
120           case ThreeByte0F3AOpcodeState:
121             state = doThreeByte0F3AOpcodeState(nextByte);
122             break;
123           case ModRMState:
124             state = doModRMState(nextByte);
125             break;
126           case SIBState:
127             state = doSIBState(nextByte);
128             break;
129           case DisplacementState:
130             state = doDisplacementState();
131             break;
132           case ImmediateState:
133             state = doImmediateState();
134             break;
135           case ErrorState:
136             panic("Went to the error state in the decoder.\n");
137           default:
138             panic("Unrecognized state! %d\n", state);
139         }
140     }
141 }

Based on the instruction format, different doXXX functions are invoked to parse the macroop instruction. First of all, doResetState is invoked for every macroop to initialize the variables representing the parsed instruction; it also sets the origPC field to the PC address of the macroop instruction. After this initialization, different parsing code is invoked depending on the instruction format. Based on the first n-1 byte(s) of the instruction, the format of the next n(+1) bytes is determined. Therefore, by parsing each byte one by one, any instruction format can be fully decoded by the above process function. During the parsing, the consumeByte(s) functions are invoked whenever a particular part of the instruction has been successfully decoded. The consumeByte function increases the offset variable of the decoder, which represents the length of the macroop currently being parsed. After moreBytes finishes the early decoding of the macroop instruction, it sets instDone to true. However, note that moreBytes and process only parse the macroop instruction to extract the bytes dedicated to each part of the instruction, such as the Rex and modRM bytes in the x86 architecture. Therefore, we still need to decode the parsed instruction to understand what this instruction actually is!

The second loop to process each instruction

After the decoder finishing early-decode of the macroop instruction, it encounters another loop that translate the macroop instruction into multiple microops if possible. Note that the processor pipeline executes the microops not the macroop instructions. Therefore, instead of the macroop, the microops should be inserted into the fetch queue.

1
2
3
4
5
6
7
1298         // Extract as many instructions and/or microops as we can from
1299         // the memory we've processed so far.
1300         do {
......
1378         } while ((curMacroop || decoder[tid]->instReady()) &&
1379                  numInst < fetchWidth &&
1380                  fetchQueue[tid].size() < fetchQueueSize);

As shown in the above code, the second loop continues while curMacroop is not NULL or the translation of a new instruction has been finished in the decoder (instReady), as long as the number of instructions fetched this cycle stays below fetchWidth and the fetchQueue still has room to contain the translated microops. Let's take a look at the details of the second loop.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
1298         // Extract as many instructions and/or microops as we can from
1299         // the memory we've processed so far.
1300         do {
1301             if (!(curMacroop || inRom)) {
1302                 if (decoder[tid]->instReady()) {
1303                     staticInst = decoder[tid]->decode(thisPC);
1304 
1305                     // Increment stat of fetched instructions.
1306                     ++fetchedInsts;
1307 
1308                     if (staticInst->isMacroop()) {
1309                         curMacroop = staticInst;
1310                     } else {
1311                         pcOffset = 0;
1312                     }
1313                 } else {
1314                     // We need more bytes for this instruction so blkOffset and
1315                     // pcOffset will be updated
1316                     break;
1317                 }
1318             }

Note that we haven't assigned anything to curMacroop and are not executing ROM code. Also, decoder[tid]->instReady() is true because the moreBytes function successfully pre-decoded the macroop instruction. Therefore, the decode function will be invoked to determine which instruction it actually is. The decode function of the decoder generates a StaticInstPtr which carries information about the instruction located at thisPC. In our case, because we are executing the macroop instruction for the first time, it should return a reference to the macroop instruction. Let's briefly take a look at the decode function.

gem5/src/arch/x86/decode.cc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
693 StaticInstPtr
694 Decoder::decode(PCState &nextPC)
695 {
696     if (!instDone)
697         return NULL;
698     instDone = false;
699     updateNPC(nextPC);
700 
701     StaticInstPtr &si = instBytes->si;
702     if (si)
703         return si;
704 
705     // We didn't match in the AddrMap, but we still populated an entry. Fix
706     // up its byte masks.
707     const int chunkSize = sizeof(MachInst);
708 
709     instBytes->lastOffset = offset;
710 
711     Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
712     Addr firstOffset = origPC - firstBasePC;
713     Addr totalSize = instBytes->lastOffset - firstOffset +
714         (instBytes->chunks.size() - 1) * chunkSize;
715     int start = firstOffset;
716     instBytes->masks.clear();
717 
718     while (totalSize) {
719         int end = start + totalSize;
720         end = (chunkSize < end) ? chunkSize : end;
721         int size = end - start;
722         int idx = instBytes->masks.size();
723 
724         MachInst maskVal = mask(size * 8) << (start * 8);
725         assert(maskVal);
726 
727         instBytes->masks.push_back(maskVal);
728         instBytes->chunks[idx] &= instBytes->masks[idx];
729         totalSize -= size;
730         start = 0;
731     }
732 
733     si = decode(emi, origPC);
734     return si;
735 }

There are two important things done by the decode function. First, it invokes updateNPC to update the next PC based on the current instruction. Also, remember that basePC was set to fetchAddr when moreBytes was invoked.

1
2
3
4
5
6
7
8
9
10
11
12
13
328     void
329     updateNPC(X86ISA::PCState &nextPC)
330     {
331         if (!nextPC.size()) {
332             int size = basePC + offset - origPC;
333             DPRINTF(Decoder,
334                     "Calculating the instruction size: "
335                     "basePC: %#x offset: %#x origPC: %#x size: %d\n",
336                     basePC, offset, origPC, size);
337             nextPC.size(size);
338             nextPC.npc(nextPC.pc() + size);
339         }
340     }

Because the decoder already knows the length of the instruction, it can calculate the instruction size and set the nextPC value to current PC + sizeof(instruction). The npc function updates the _npc field of nextPC, and this will later be used to update the _pc member field of the PCState object. Note that nextPC here is actually the thisPC variable declared in the fetch function. This can be confusing because the fetch function also declares its own nextPC variable, but updateNPC updates the npc of thisPC, not the nextPC variable of fetch. After updating the npc, the decode function invokes the actual decode function. It is also important that the updateNPC function is only invoked when curMacroop is NULL. While the microops of a macroop are being fetched, the npc is not updated.

1
2
3
4
5
6
7
8
9
10
11
681 StaticInstPtr
682 Decoder::decode(ExtMachInst mach_inst, Addr addr)
683 {
684     auto iter = instMap->find(mach_inst);
685     if (iter != instMap->end())
686         return iter->second;
687 
688     StaticInstPtr si = decodeInst(mach_inst);
689     (*instMap)[mach_inst] = si;
690     return si;
691 }

It traverses the decode cache instMap to find a cached instruction object in case the same instruction has been decoded earlier. If not, it invokes the decodeInst function, which is automatically generated by the Python-based parser in GEM5. We will not cover the details of the decodeInst function in this posting. Let's go back to the second loop again! After the decode function executes, we finally have the object associated with the decoded instruction. If the decoded instruction is a macroop, curMacroop is set to the returned staticInst.

fetchMicroop: Fetching microops from the macroop or ROM

1
2
3
4
5
6
7
8
9
10
11
12
13
1319             // Whether we're moving to a new macroop because we're at the
1320             // end of the current one, or the branch predictor incorrectly
1321             // thinks we are...
1322             bool newMacro = false;
1323             if (curMacroop || inRom) {
1324                 if (inRom) {
1325                     staticInst = cpu->microcodeRom.fetchMicroop(
1326                             thisPC.microPC(), curMacroop);
1327                 } else {
1328                     staticInst = curMacroop->fetchMicroop(thisPC.microPC());
1329                 }
1330                 newMacro |= staticInst->isLastMicroop();
1331             }

curMacroop is set to the macroop instruction pointed to by the PC. However, to execute the instruction on the pipeline, we need access to the microops that make up the current macroop. You might remember that a macroop consists of multiple microops. It might also remind you of the ROM code. Indeed, there are two places where microops come from. Therefore, based on the current status of the processor — whether it is executing a macroop or ROM code — it needs to fetch the microops from the relevant place. Regardless of the location, GEM5 utilizes an interface called fetchMicroop. When the processor is in the midst of executing ROM code, it invokes the fetchMicroop function of the microcodeRom.

gem5/src/arch/x86/microcode_rom.hh

1
2
3
4
5
6
7
8
9
 60         StaticInstPtr
 61         fetchMicroop(MicroPC microPC, StaticInstPtr curMacroop)
 62         {
 63             microPC = normalMicroPC(microPC);
 64             if (microPC >= numMicroops)
 65                 return X86ISA::badMicroop;
 66             else
 67                 return genFuncs[microPC](curMacroop);
 68         }

Also when the processor is in the middle of executing the macroop, it should ask the macroop to return microops consisting of it.

gem5/src/arch/x86/insts/macroop.hh

1
2
3
4
5
6
7
8
 77     StaticInstPtr
 78     fetchMicroop(MicroPC microPC) const
 79     {
 80         if (microPC >= numMicroops)
 81             return badMicroop;
 82         else
 83             return microops[microPC];
 84     }

gem5/src/cpu/fetch_impl.hh

1
1239     StaticInstPtr staticInst = NULL;

The return value of the fetchMicroop function is stored into staticInst, which is a StaticInstPtr and can therefore point to any instruction. Previously, the decoded macroop was pointed to by this staticInst variable. The class provides a method to discern whether it is a macroop or a microop.

Populating dynamic instruction object

1
2
3
4
5
6
7
8
9
10
11
12
13
1332 
1333             DynInstPtr instruction =
1334                 buildInst(tid, staticInst, curMacroop,
1335                           thisPC, nextPC, true);
1336 
1337             ppFetch->notify(instruction);
1338             numInst++;
1339 
1340 #if TRACING_ON
1341             if (DTRACE(O3PipeView)) {
1342                 instruction->fetchTick = curTick();
1343             }
1344 #endif

Now we have a macroop pointed to by the curMacroop variable and its associated microop pointed to by staticInst. Using this information, the buildInst function populates the dynamic object representing one instruction that can actually be executed on the pipeline. One might ask why we need yet another object for an instruction. Note, however, that the previous objects are static instruction objects, while we need a dynamic instruction object that conveys all the information required to execute the instruction through the pipeline. Dynamic instruction objects are populated in order to pass instruction information between different pipeline stages. Therefore, the buildInst function generates the dynamic instruction and enqueues it into the fetch queue to pass the instruction information to the next pipeline stages. Let's take a look at how buildInst generates the dynamic instruction.

buildInst: populating microops from the macroop

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
1102 template<class Impl>
1103 typename Impl::DynInstPtr
1104 DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
1105                               StaticInstPtr curMacroop, TheISA::PCState thisPC,
1106                               TheISA::PCState nextPC, bool trace)
1107 {
1108     // Get a sequence number.
1109     InstSeqNum seq = cpu->getAndIncrementInstSeq();
1110 
1111     // Create a new DynInst from the instruction fetched.
1112     DynInstPtr instruction =
1113         new DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu);
1114     instruction->setTid(tid);
1115 
1116     instruction->setASID(tid);
1117 
1118     instruction->setThreadState(cpu->thread[tid]);
1119 
1120     DPRINTF(Fetch, "[tid:%i] Instruction PC %#x (%d) created "
1121             "[sn:%lli].\n", tid, thisPC.instAddr(),
1122             thisPC.microPC(), seq);
1123 
1124     DPRINTF(Fetch, "[tid:%i] Instruction is: %s\n", tid,
1125             instruction->staticInst->
1126             disassemble(thisPC.instAddr()));

You can think of the DynInst as the metadata conveying all the information needed to execute one instruction. After the instruction is generated, its thread-specific information (tid, ASID) is set. That information is required later, in the execution stage, to understand which instruction has been issued by which thread.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
 53 struct O3CPUImpl
 54 {
 55     /** The type of MachInst. */
 56     typedef TheISA::MachInst MachInst;
 57
 58     /** The CPU policy to be used, which defines all of the CPU stages. */
 59     typedef SimpleCPUPolicy<O3CPUImpl> CPUPol;
 60
 61     /** The DynInst type to be used. */
 62     typedef BaseO3DynInst<O3CPUImpl> DynInst;
 63
 64     /** The refcounted DynInst pointer to be used.  In most cases this is
 65      *  what should be used, and not DynInst *.
 66      */
 67     typedef RefCountingPtr<DynInst> DynInstPtr;

The constructor call of the DynInst invokes the constructor of the BaseO3DynInst class and initialize its member field as described in the following constructor.

gem5/src/cpu/o3/dyn_inst_impl.hh

1
2
3
4
5
6
7
8
9
 50 template <class Impl>
 51 BaseO3DynInst<Impl>::BaseO3DynInst(const StaticInstPtr &staticInst,
 52                                    const StaticInstPtr &macroop,
 53                                    TheISA::PCState pc, TheISA::PCState predPC,
 54                                    InstSeqNum seq_num, O3CPU *cpu)
 55     : BaseDynInst<Impl>(staticInst, macroop, pc, predPC, seq_num, cpu)
 56 {
 57     initVars();
 58 }

Let’s take a look at who derives the DynInstPtr then.

1
2
3
4
5
6
7
8
 97 template <class Impl>
 98 class FullO3CPU : public BaseO3CPU
 99 {
100   public:
101     // Typedefs from the Impl here.
102     typedef typename Impl::CPUPol CPUPolicy;
103     typedef typename Impl::DynInstPtr DynInstPtr;
104     typedef typename Impl::O3CPU O3CPU;

As shown in the above code, DynInstPtr is Impl::DynInstPtr, which is the RefCountingPtr defined in O3CPUImpl. RefCountingPtr is a C++ template class defining all the operations needed to use it like a pointer — such as the assignment operator, which can assign a new object of the template type, and the member access operator ->, which accesses the assigned object. The only additional work done by this class is counting the references to the object. Therefore, without knowing the details, the instruction variable can be used as a pointer referencing DynInst objects.

Inserting generated dynamic instructions into the fetchQueue

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
1127 
1128 #if TRACING_ON
1129     if (trace) {
1130         instruction->traceData =
1131             cpu->getTracer()->getInstRecord(curTick(), cpu->tcBase(tid),
1132                     instruction->staticInst, thisPC, curMacroop);
1133     }
1134 #else
1135     instruction->traceData = NULL;
1136 #endif
1137 
1138     // Add instruction to the CPU's list of instructions.
1139     instruction->setInstListIt(cpu->addInst(instruction));
1140 
1141     // Write the instruction to the first slot in the queue
1142     // that heads to decode.
1143     assert(numInst < fetchWidth);
1144     fetchQueue[tid].push_back(instruction);
1145     assert(fetchQueue[tid].size() <= fetchQueueSize);
1146     DPRINTF(Fetch, "[tid:%i] Fetch queue entry created (%i/%i).\n",
1147             tid, fetchQueue[tid].size(), fetchQueueSize);
1148     //toDecode->insts[toDecode->size++] = instruction;
1149 
1150     // Keep track of if we can take an interrupt at this boundary
1151     delayedCommit[tid] = instruction->isDelayedCommit();
1152 
1153     return instruction;
1154 }

After the dynamic instruction is populated, it is inserted into the fetchQueue to pass the generated instructions to the next stage. Now let's go back to the second loop of the fetch function.

Updating nextPC and handling branch instruction

1
2
3
4
5
6
7
8
9
10
1346             nextPC = thisPC;
1347 
1348             // If we're branching after this instruction, quit fetching
1349             // from the same block.
1350             predictedBranch |= thisPC.branching();
1351             predictedBranch |=
1352                 lookupAndUpdateNextPC(instruction, nextPC);
1353             if (predictedBranch) {
1354                 DPRINTF(Fetch, "Branch detected with PC = %s\n", thisPC);
1355             }

Until now, we have populated the microops and enqueued the generated instructions into the fetchQueue. To repeat this sequence of operations and fill the fetchQueue, the second loop should determine the next PC to look up. First of all, if the current instruction is one of the branching instructions, the nextPC should be speculatively determined based on the result of branch prediction.

lookupAndUpdateNextPC: determine the nextPC based on control flow instruction

The lookupAndUpdateNextPC function determines the nextPC by checking whether the current instruction is a control-flow instruction. Also, because the O3 processor adopts a branch predictor, lookupAndUpdateNextPC asks the branch predictor whether it needs to change the nextPC when the current instruction is a branching instruction. Note that lookupAndUpdateNextPC accepts the dynamic instruction we generated in the buildInst function.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 556 template <class Impl>
 557 bool
 558 DefaultFetch<Impl>::lookupAndUpdateNextPC(
 559         const DynInstPtr &inst, TheISA::PCState &nextPC)
 560 {
 561     // Do branch prediction check here.
 562     // A bit of a misnomer...next_PC is actually the current PC until
 563     // this function updates it.
 564     bool predict_taken;
 565 
 566     if (!inst->isControl()) {
 567         TheISA::advancePC(nextPC, inst->staticInst);
 568         inst->setPredTarg(nextPC);
 569         inst->setPredTaken(false);
 570         return false;
 571     }

First of all, it can simply check whether the current instruction affects control flow by invoking the isControl method of the dynamic instruction. The isControl function of the dynamic instruction just invokes the same method on the staticInst of the DynInst, which is the static class representing the microop operation. If the current instruction is not a control-flow instruction, it just updates nextPC by invoking the advancePC function with the staticInst of the current dynamic instruction (because fetching is done at the macroop level).

advancePC: advance micro pc or pc based on the architecture

gem5/src/arch/x86/utility.hh

1
2
3
4
5
 78     inline void
 79     advancePC(PCState &pc, const StaticInstPtr &inst)
 80     {
 81         inst->advancePC(pc);
 82     }

The advancePC function invokes advancePC function of the StaticInstPtr class back to back. Because we are targeting X86 architecture, the inst should be the object of the X86StaticInst class.

gem5/src/arch/x86/insts/static_inst.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
 77     /**
 78      * Base class for all X86 static instructions.
 79      */
 80 
 81     class X86StaticInst : public StaticInst
 82     {
 83       protected:
 84         // Constructor.
 85         X86StaticInst(const char *mnem,
 86              ExtMachInst _machInst, OpClass __opClass)
 87                 : StaticInst(mnem, _machInst, __opClass)
 88             {
 89             }
......
179         void
180         advancePC(PCState &pcState) const
181         {
182             pcState.advance();
183         }
184     };

Also, recall that the x86 architecture executes microops instead of macroops. Therefore, the StaticInstPtr points to a microop object in x86. Thus, x86 on GEM5 provides another class, called X86MicroopBase, that inherits from X86StaticInst.

gem5/src/arch/x86/insts/microop.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
 88     //A class which is the base of all x86 micro ops. It provides a function to
 89     //set necessary flags appropriately.
 90     class X86MicroopBase : public X86StaticInst
 91     {
 92       protected:
 93         const char * instMnem;
 94         uint8_t opSize;
 95         uint8_t addrSize;
 96 
 97         X86MicroopBase(ExtMachInst _machInst,
 98                 const char *mnem, const char *_instMnem,
 99                 uint64_t setFlags, OpClass __opClass) :
100             X86ISA::X86StaticInst(mnem, _machInst, __opClass),
101             instMnem(_instMnem)
102         {
103             const int ChunkSize = sizeof(unsigned long);
104             const int Chunks = sizeof(setFlags) / ChunkSize;
105 
106             // Since the bitset constructor can only handle unsigned long
107             // sized chunks, feed it those one at a time while oring them in.
108             for (int i = 0; i < Chunks; i++) {
109                 unsigned shift = i * ChunkSize * 8;
110                 flags |= (std::bitset<Num_Flags>(setFlags >> shift) << shift);
111             }
112         }
113 
114         std::string generateDisassembly(Addr pc,
115                 const SymbolTable *symtab) const
116         {
117             std::stringstream ss;
118 
119             ccprintf(ss, "\t%s.%s", instMnem, mnemonic);
120 
121             return ss.str();
122         }
123 
124         bool checkCondition(uint64_t flags, int condition) const;
125 
126         void
127         advancePC(PCState &pcState) const
128         {
129             if (flags[IsLastMicroop])
130                 pcState.uEnd();
131             else
132                 pcState.uAdvance();
133         }
134     };

Based on whether it is the last microop, it invokes a different function of the PCState, uEnd or uAdvance respectively. Here the pcState object is the architecture-specific PCState object defined as below.

PCState class

gem5/src/arch/x86/types.hh

1
2
3
4
5
6
7
8
9
10
11
289     class PCState : public GenericISA::UPCState<MachInst>
290     {
291       protected:
292         typedef GenericISA::UPCState<MachInst> Base;
......
324         void
325         advance()
326         {
327             Base::advance();
328             _size = 0;
329         }

Because the PCState doesn’t implement the uEnd and uAdvance functions, we should take a look at its parent class, GenericISA::UPCState.

gem5/src/arch/generic/types.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
193 // A PC and microcode PC.
194 template <class MachInst>
195 class UPCState : public SimplePCState<MachInst>
196 {
197   protected:
198     typedef SimplePCState<MachInst> Base;
199 
200     MicroPC _upc;
201     MicroPC _nupc;
202 
203   public:
204 
205     MicroPC upc() const { return _upc; }
206     void upc(MicroPC val) { _upc = val; }
207 
208     MicroPC nupc() const { return _nupc; }
209     void nupc(MicroPC val) { _nupc = val; }
......
228     bool
229     branching() const
230     {
231         return this->npc() != this->pc() + sizeof(MachInst) ||
232                this->nupc() != this->upc() + 1;
233     }
234 
235     // Advance the upc within the instruction.
236     void
237     uAdvance()
238     {
239         _upc = _nupc;
240         _nupc++;
241     }
242 
243     // End the macroop by resetting the upc and advancing the regular pc.
244     void
245     uEnd()
246     {
247         this->advance();
248         _upc = 0;
249         _nupc = 1;
250     }

When the uAdvance function is invoked, it just updates the _upc member field representing the micro pc of the current hardware thread. However, when uEnd is invoked, it should update the pc instead of the micro pc (upc). Because UPCState doesn’t implement the PC-related member fields and functions, it invokes the advance function of its parent, SimplePCState. Note that the pc points to the macroop, while the upc is the instruction pointer among the microops that constitute one macroop.

gem5/src/arch/generic/types.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
139 // The most basic type of PC.
140 template <class MachInst>
141 class SimplePCState : public PCStateBase
142 {
143   protected:
144     typedef PCStateBase Base;
145 
146   public:
147 
148     Addr pc() const { return _pc; }
149     void pc(Addr val) { _pc = val; }
150 
151     Addr npc() const { return _npc; }
152     void npc(Addr val) { _npc = val; }
153 
154     void
155     set(Addr val)
156     {
157         pc(val);
158         npc(val + sizeof(MachInst));
159     };
160 
161     void
162     setNPC(Addr val)
163     {
164         npc(val);
165     }
166 
167     SimplePCState() {}
168     SimplePCState(Addr val) { set(val); }
169 
170     bool
171     branching() const
172     {
173         return this->npc() != this->pc() + sizeof(MachInst);
174     }
175 
176     // Advance the PC.
177     void
178     advance()
179     {
180         _pc = _npc;
181         _npc += sizeof(MachInst);
182     }
183 };

It just updates _pc to _npc, which was the result of adding the size of the instruction to the current pc. In other words, if it is not a control flow instruction, just adding the size of the current instruction to the pc is enough to get the next pc address.

Asking branch predictor for a control flow instruction

Now let’s go back to the rest of the lookupAndUpdateNextPC function to understand what happens if the current instruction turns out to be control flow instruction.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
 572 
 573     ThreadID tid = inst->threadNumber;
 574     predict_taken = branchPred->predict(inst->staticInst, inst->seqNum,
 575                                         nextPC, tid);
 576 
 577     if (predict_taken) {
 578         DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
 579                 "predicted to be taken to %s\n",
 580                 tid, inst->seqNum, inst->pcState().instAddr(), nextPC);
 581     } else {
 582         DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
 583                 "predicted to be not taken\n",
 584                 tid, inst->seqNum, inst->pcState().instAddr());
 585     }
 586 
 587     DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
 588             "predicted to go to %s\n",
 589             tid, inst->seqNum, inst->pcState().instAddr(), nextPC);
 590     inst->setPredTarg(nextPC);
 591     inst->setPredTaken(predict_taken);
 592 
 593     ++fetchedBranches;
 594 
 595     if (predict_taken) {
 596         ++predictedBranches;
 597     }
 598 
 599     return predict_taken;
 600 }

It invokes the predict function and stores the return value in predict_taken. The predict function returns the prediction result: whether the branch instruction should be taken or not-taken (when it is not a control flow instruction, it returns not-taken so that the following instructions are executed sequentially). Also, note that a reference to nextPC is passed to the branch predictor. This is because the prediction affects the next instruction’s address. Therefore, based on the prediction result, it changes nextPC to make the fetch stage fetch instructions from the proper location.

End of the second loop

1
2
3
1356 
1357             newMacro |= thisPC.instAddr() != nextPC.instAddr();
1358 

Remember that we are currently executing the second loop to translate curMacroop into microops. However, when one of its microops turns out to be a control flow instruction and is predicted to be taken, it should change the PC. For that purpose, it checks the PC addresses of thisPC and nextPC. Previously, before invoking the lookupAndUpdateNextPC function, it assigned thisPC to nextPC (line 1346). However, when the prediction is made as taken, the pc address of nextPC will be changed to the target of the taken branch. Therefore, by comparing the pc addresses of nextPC and thisPC, we can determine whether we are facing another macroop or still executing the microops of the current macroop (line 1357).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
 53 // The guaranteed interface.
 54 class PCStateBase : public Serializable
 55 {
 56   protected:
 57     Addr _pc;
 58     Addr _npc;
 59 
 60     PCStateBase() : _pc(0), _npc(0) {}
 61     PCStateBase(Addr val) : _pc(0), _npc(0) { set(val); }
 62 
 63   public:
 64     /**
 65      * Returns the memory address the bytes of this instruction came from.
 66      *
 67      * @return Memory address of the current instruction's encoding.
 68      */
 69     Addr
 70     instAddr() const
 71     {
 72         return _pc;
 73     }
 74 
 75     /**
 76      * Returns the memory address the bytes of the next instruction came from.
 77      *
 78      * @return Memory address of the next instruction's encoding.
 79      */
 80     Addr
 81     nextInstAddr() const
 82     {
 83         return _npc;
 84     }
 85 
 86     /**
 87      * Returns the current micropc.
 88      *
 89      * @return The current micropc.
 90      */
 91     MicroPC
 92     microPC() const
 93     {
 94         return 0;
 95     }


After the newMacro flag has been set, it assigns nextPC to thisPC. One might think that nextPC will be equal to thisPC when the branch prediction is made as not-taken, but lookupAndUpdateNextPC advances the micro-pc by invoking the advancePC function when the instruction is not a control flow instruction or is predicted as not-taken.

```cpp
1359             // Move to the next instruction, unless we have a branch.
1360             thisPC = nextPC;
1361             inRom = isRomMicroPC(thisPC.microPC());
1362 
1363             if (newMacro) {
1364                 fetchAddr = thisPC.instAddr() & BaseCPU::PCMask;
1365                 blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;
1366                 pcOffset = 0;
1367                 curMacroop = NULL;
1368             }
1369 
1370             if (instruction->isQuiesce()) {
1371                 DPRINTF(Fetch,
1372                         "Quiesce instruction encountered, halting fetch!\n");
1373                 fetchStatus[tid] = QuiescePending;
1374                 status_change = true;
1375                 quiesce = true;
1376                 break;
1377             }
1378         } while ((curMacroop || decoder[tid]->instReady()) &&
1379                  numInst < fetchWidth &&
1380                  fetchQueue[tid].size() < fetchQueueSize);
1381
1382         // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383         // or not.

If the newMacro flag is set to true, it updates the addresses required to fetch the next instruction and sets curMacroop to NULL. Therefore, when a new macroop is found, the second loop will exit and execution continues with the first loop.

End of the first loop and rest

1
2
3
4
5
6
7
8
9
10
1263     // Loop through instruction memory from the cache.
1264     // Keep issuing while fetchWidth is available and branch is not
1265     // predicted taken
1266     while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize
1267            && !predictedBranch && !quiesce) {
......
1382         // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383         // or not.
1384         inRom = isRomMicroPC(thisPC.microPC());
1385     }

After translating the macroop into microops by executing the second loop, execution continues with the first loop. As we checked before, as long as the number of fetched instructions does not exceed the fetchWidth (bandwidth), the fetchQueue does not overflow, and no taken-branch prediction has been made, all the logic that we have checked so far will be repeated. Then what should be done when the first loop exits?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
1386
1387     if (predictedBranch) {
1388         DPRINTF(Fetch, "[tid:%i] Done fetching, predicted branch "
1389                 "instruction encountered.\n", tid);
1390     } else if (numInst >= fetchWidth) {
1391         DPRINTF(Fetch, "[tid:%i] Done fetching, reached fetch bandwidth "
1392                 "for this cycle.\n", tid);
1393     } else if (blkOffset >= fetchBufferSize) {
1394         DPRINTF(Fetch, "[tid:%i] Done fetching, reached the end of the"
1395                 "fetch buffer.\n", tid);
1396     }
1397 
1398     macroop[tid] = curMacroop;
1399     fetchOffset[tid] = pcOffset;

First, it prints out debugging messages based on the exit condition of the first loop. Then it updates the macroop of the current hardware thread with curMacroop. Also, the fetchOffset will be updated with pcOffset.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
1400 
1401     if (numInst > 0) {
1402         wroteToTimeBuffer = true;
1403     }
1404 
1405     pc[tid] = thisPC;
1406 
1407     // pipeline a fetch if we're crossing a fetch buffer boundary and not in
1408     // a state that would preclude fetching
1409     fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1410     Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1411     issuePipelinedIfetch[tid] = fetchBufferBlockPC != fetchBufferPC[tid] &&
1412         fetchStatus[tid] != IcacheWaitResponse &&
1413         fetchStatus[tid] != ItlbWait &&
1414         fetchStatus[tid] != IcacheWaitRetry &&
1415         fetchStatus[tid] != QuiescePending &&
1416         !curMacroop;
1417 }

Rest of the tick function of the fetch.

Issuing the Icache access for split access (TODO)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
 936     // Record number of instructions fetched this cycle for distribution.
 937     fetchNisnDist.sample(numInst);
 938 
 939     if (status_change) {
 940         // Change the fetch stage status if there was a status change.
 941         _status = updateFetchStatus();
 942     }
 943 
 944     // Issue the next I-cache request if possible.
 945     for (ThreadID i = 0; i < numThreads; ++i) {
 946         if (issuePipelinedIfetch[i]) {
 947             pipelineIcacheAccesses(i);
 948         }
 949     }
 950 
 951     // Send instructions enqueued into the fetch queue to decode.
 952     // Limit rate by fetchWidth.  Stall if decode is stalled.
 953     unsigned insts_to_decode = 0;
 954     unsigned available_insts = 0;
 955 
 956     for (auto tid : *activeThreads) {
 957         if (!stalls[tid].decode) {
 958             available_insts += fetchQueue[tid].size();
 959         }
 960     }

Sending fetched instructions to decode stage

gem5/src/cpu/o3/fetch_impl.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
 961 
 962     // Pick a random thread to start trying to grab instructions from
 963     auto tid_itr = activeThreads->begin();
 964     std::advance(tid_itr, random_mt.random<uint8_t>(0, activeThreads->size() - 1));
 965 
 966     while (available_insts != 0 && insts_to_decode < decodeWidth) {
 967         ThreadID tid = *tid_itr;
 968         if (!stalls[tid].decode && !fetchQueue[tid].empty()) {
 969             const auto& inst = fetchQueue[tid].front();
 970             toDecode->insts[toDecode->size++] = inst;
 971             DPRINTF(Fetch, "[tid:%i] [sn:%llu] Sending instruction to decode "
 972                     "from fetch queue. Fetch queue size: %i.\n",
 973                     tid, inst->seqNum, fetchQueue[tid].size());
 974 
 975             wroteToTimeBuffer = true;
 976             fetchQueue[tid].pop_front();
 977             insts_to_decode++;
 978             available_insts--;
 979         }
 980 
 981         tid_itr++;
 982         // Wrap around if at end of active threads list
 983         if (tid_itr == activeThreads->end())
 984             tid_itr = activeThreads->begin();
 985     }
 986 
 987     // If there was activity this cycle, inform the CPU of it.
 988     if (wroteToTimeBuffer) {
 989         DPRINTF(Activity, "Activity this cycle.\n");
 990         cpu->activityThisCycle();
 991     }
 992 
 993     // Reset the number of the instruction we've fetched.
 994     numInst = 0;
 995 }   //end of the fetch.tick

The last job of the fetch stage is passing the fetched instructions to the next stage, the decode stage. In the above code, the toDecode member field of the fetch stage is used as storage located between the fetch and decode stages.

FetchStruct: passing fetch stage’s information to decode stage

gem5/src/cpu/o3/fetch.hh

1
2
3
4
5
6
7
8
9
10
11
12
431     //Might be annoying how this name is different than the queue.
432     /** Wire used to write any information heading to decode. */
433     typename TimeBuffer<FetchStruct>::wire toDecode;
......
458     /** Source of possible stalls. */
459     struct Stalls {
460         bool decode;
461         bool drain;
462     };
463 
464     /** Tracks which stages are telling fetch to stall. */
465     Stalls stalls[Impl::MaxThreads];

The toDecode is declared as a wire class defined inside the TimeBuffer class. Also, because TimeBuffer is a template class, it is instantiated with the FetchStruct that contains all the fetch stage’s information required by the decode stage. Let’s take a look at the FetchStruct to understand which information is passed to the decode stage.

gem5/src/cpu/o3/cpu_policy.hh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
 60 template<class Impl>
 61 struct SimpleCPUPolicy
 62 {
 ......
 89     /** The struct for communication between fetch and decode. */
 90     typedef DefaultFetchDefaultDecode<Impl> FetchStruct;
 91 
 92     /** The struct for communication between decode and rename. */
 93     typedef DefaultDecodeDefaultRename<Impl> DecodeStruct;
 94 
 95     /** The struct for communication between rename and IEW. */
 96     typedef DefaultRenameDefaultIEW<Impl> RenameStruct;
 97 
 98     /** The struct for communication between IEW and commit. */
 99     typedef DefaultIEWDefaultCommit<Impl> IEWStruct;
100 
101     /** The struct for communication within the IEW stage. */
102     typedef ::IssueStruct<Impl> IssueStruct;
103 
104     /** The struct for all backwards communication. */
105     typedef TimeBufStruct<Impl> TimeStruct;

gem5/src/cpu/o3/comm.h

1
2
3
4
5
6
7
8
9
10
11
12
 55 /** Struct that defines the information passed from fetch to decode. */
 56 template<class Impl>
 57 struct DefaultFetchDefaultDecode {
 58     typedef typename Impl::DynInstPtr DynInstPtr;
 59 
 60     int size;
 61 
 62     DynInstPtr insts[Impl::MaxWidth];
 63     Fault fetchFault;
 64     InstSeqNum fetchFaultSN;
 65     bool clearFetchFault;
 66 };

Most importantly, it passes the instructions fetched from the Icache.

TimeBuffer::wire generic class representing wire

The information passed from the decode stage to fetch stage is represented as multiple wires conveying bits of information. For that purpose, GEM5 provides wire class.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
 39 template <class T>
 40 class TimeBuffer
 41 {
 42   protected:
 43     int past;
 44     int future;
 45     unsigned size;
 46     int _id;
 47 
 48     char *data;
 49     std::vector<char *> index;
 50     unsigned base;
 51 
 52     void valid(int idx) const
 53     {
 54         assert (idx >= -past && idx <= future);
 55     }
 56 
 57   public:
 58     friend class wire;
 59     class wire
 60     {
 61         friend class TimeBuffer;
 62       protected:
 63         TimeBuffer<T> *buffer;
 64         int index;
 65 
 66         void set(int idx)
 67         {   
 68             buffer->valid(idx);
 69             index = idx;
 70         }
 71 
 72         wire(TimeBuffer<T> *buf, int i)
 73             : buffer(buf), index(i)
 74         { }
 75 
 76       public:
 77         wire()
 78         { }
 79 
 80         wire(const wire &i)
 81             : buffer(i.buffer), index(i.index)
 82         { }
 83 
 84         const wire &operator=(const wire &i)
 85         {
 86             buffer = i.buffer;
 87             set(i.index);
 88             return *this;
 89         }
 90 
 91         const wire &operator=(int idx)
 92         {
 93             set(idx);
 94             return *this;
 95         }
 96 
 97         const wire &operator+=(int offset)
 98         {
 99             set(index + offset);
100             return *this;
101         }
102 
103         const wire &operator-=(int offset)
104         {
105             set(index - offset);
106             return *this;
107         }
108 
109         wire &operator++()
110         {
111             set(index + 1);
112             return *this;
113         }
114 
115         wire &operator++(int)
116         {
117             int i = index;
118             set(index + 1);
119             return wire(this, i);
120         }
121 
122         wire &operator--()
123         {
124             set(index - 1);
125             return *this;
126         }
127 
128         wire &operator--(int)
129         {
130             int i = index;
131             set(index - 1);
132             return wire(this, i);
133         }
134         T &operator*() const { return *buffer->access(index); }
135         T *operator->() const { return buffer->access(index); }
136     };
......
192   protected:
193     //Calculate the index into this->index for element at position idx
194     //relative to now
195     inline int calculateVectorIndex(int idx) const
196     {
197         //Need more complex math here to calculate index.
198         valid(idx);
199 
200         int vector_index = idx + base;
201         if (vector_index >= (int)size) {
202             vector_index -= size;
203         } else if (vector_index < 0) {
204             vector_index += size;
205         }
206 
207         return vector_index;
208     }
209 
210   public:
211     T *access(int idx)
212     {
213         int vector_index = calculateVectorIndex(idx);
214 
215         return reinterpret_cast<T *>(index[vector_index]);
216     }

As shown in line 970 of the tick function of the fetch stage, it references the insts member field through the -> operator. Because toDecode is declared as a TimeBuffer::wire, and this class overloads the -> operator, it will invoke the operator-> function shown in line 135. (TODO: this needs to be explained more clearly in terms of smart pointers.)

This post is licensed under CC BY 4.0 by the author.

Comments powered by Disqus.