O3 Cpu Fetch
Fetch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
895 template <class Impl>
896 void
897 DefaultFetch<Impl>::tick()
898 {
899 list<ThreadID>::iterator threads = activeThreads->begin();
900 list<ThreadID>::iterator end = activeThreads->end();
901 bool status_change = false;
902
903 wroteToTimeBuffer = false;
904
905 for (ThreadID i = 0; i < numThreads; ++i) {
906 issuePipelinedIfetch[i] = false;
907 }
908
909 while (threads != end) {
910 ThreadID tid = *threads++;
911
912 // Check the signals for each thread to determine the proper status
913 // for each thread.
914 bool updated_status = checkSignalsAndUpdate(tid);
915 status_change = status_change || updated_status;
916 }
917
918 DPRINTF(Fetch, "Running stage.\n");
919
920 if (FullSystem) {
921 if (fromCommit->commitInfo[0].interruptPending) {
922 interruptPending = true;
923 }
924
925 if (fromCommit->commitInfo[0].clearInterrupt) {
926 interruptPending = false;
927 }
928 }
929
930 for (threadFetched = 0; threadFetched < numFetchingThreads;
931 threadFetched++) {
932 // Fetch each of the actively fetching threads.
933 fetch(status_change);
934 }
935
936 // Record number of instructions fetched this cycle for distribution.
937 fetchNisnDist.sample(numInst);
938
939 if (status_change) {
940 // Change the fetch stage status if there was a status change.
941 _status = updateFetchStatus();
942 }
943
944 // Issue the next I-cache request if possible.
945 for (ThreadID i = 0; i < numThreads; ++i) {
946 if (issuePipelinedIfetch[i]) {
947 pipelineIcacheAccesses(i);
948 }
949 }
950
951 // Send instructions enqueued into the fetch queue to decode.
952 // Limit rate by fetchWidth. Stall if decode is stalled.
953 unsigned insts_to_decode = 0;
954 unsigned available_insts = 0;
955
956 for (auto tid : *activeThreads) {
957 if (!stalls[tid].decode) {
958 available_insts += fetchQueue[tid].size();
959 }
960 }
961
962 // Pick a random thread to start trying to grab instructions from
963 auto tid_itr = activeThreads->begin();
964 std::advance(tid_itr, random_mt.random<uint8_t>(0, activeThreads->size() - 1));
965
966 while (available_insts != 0 && insts_to_decode < decodeWidth) {
967 ThreadID tid = *tid_itr;
968 if (!stalls[tid].decode && !fetchQueue[tid].empty()) {
969 const auto& inst = fetchQueue[tid].front();
970 toDecode->insts[toDecode->size++] = inst;
971 DPRINTF(Fetch, "[tid:%i] [sn:%llu] Sending instruction to decode "
972 "from fetch queue. Fetch queue size: %i.\n",
973 tid, inst->seqNum, fetchQueue[tid].size());
974
975 wroteToTimeBuffer = true;
976 fetchQueue[tid].pop_front();
977 insts_to_decode++;
978 available_insts--;
979 }
980
981 tid_itr++;
982 // Wrap around if at end of active threads list
983 if (tid_itr == activeThreads->end())
984 tid_itr = activeThreads->begin();
985 }
986
987 // If there was activity this cycle, inform the CPU of it.
988 if (wroteToTimeBuffer) {
989 DPRINTF(Activity, "Activity this cycle.\n");
990 cpu->activityThisCycle();
991 }
992
993 // Reset the number of the instruction we've fetched.
994 numInst = 0;
995 }
fetch: resolving TLB and cache accesses to actually fetch instructions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
1157 void
1158 DefaultFetch<Impl>::fetch(bool &status_change)
1159 {
1160 //////////////////////////////////////////
1161 // Start actual fetch
1162 //////////////////////////////////////////
1163 ThreadID tid = getFetchingThread();
1164
1165 assert(!cpu->switchedOut());
1166
1167 if (tid == InvalidThreadID) {
1168 // Breaks looping condition in tick()
1169 threadFetched = numFetchingThreads;
1170
1171 if (numThreads == 1) { // @todo Per-thread stats
1172 profileStall(0);
1173 }
1174
1175 return;
1176 }
1177
1178 DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid);
1179
1180 // The current PC.
1181 TheISA::PCState thisPC = pc[tid];
1182
1183 Addr pcOffset = fetchOffset[tid];
1184 Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1185
1186 bool inRom = isRomMicroPC(thisPC.microPC());
1187
1188 // If returning from the delay of a cache miss, then update the status
1189 // to running, otherwise do the cache access. Possibly move this up
1190 // to tick() function.
1191 if (fetchStatus[tid] == IcacheAccessComplete) {
1192 DPRINTF(Fetch, "[tid:%i] Icache miss is complete.\n", tid);
1193
1194 fetchStatus[tid] = Running;
1195 status_change = true;
1196 } else if (fetchStatus[tid] == Running) {
1197 // Align the fetch PC so its at the start of a fetch buffer segment.
1198 Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1199
1200 // If buffer is no longer valid or fetchAddr has moved to point
1201 // to the next cache block, AND we have no remaining ucode
1202 // from a macro-op, then start fetch from icache.
1203 if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid])
1204 && !inRom && !macroop[tid]) {
1205 DPRINTF(Fetch, "[tid:%i] Attempting to translate and read "
1206 "instruction, starting at PC %s.\n", tid, thisPC);
1207
1208 fetchCacheLine(fetchAddr, tid, thisPC.instAddr());
1209
1210 if (fetchStatus[tid] == IcacheWaitResponse)
1211 ++icacheStallCycles;
1212 else if (fetchStatus[tid] == ItlbWait)
1213 ++fetchTlbCycles;
1214 else
1215 ++fetchMiscStallCycles;
1216 return;
1217 } else if ((checkInterrupt(thisPC.instAddr()) && !delayedCommit[tid])) {
1218 // Stall CPU if an interrupt is posted and we're not issuing
1219 // an delayed commit micro-op currently (delayed commit instructions
1220 // are not interruptable by interrupts, only faults)
1221 ++fetchMiscStallCycles;
1222 DPRINTF(Fetch, "[tid:%i] Fetch is stalled!\n", tid);
1223 return;
1224 }
1225 } else {
1226 if (fetchStatus[tid] == Idle) {
1227 ++fetchIdleCycles;
1228 DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid);
1229 }
1230
1231 // Status is Idle, so fetch should do nothing.
1232 return;
1233 }
......
1417 }
The fetch function is a fairly complex and long function to analyze at once. Therefore, we will divide it into two main parts to understand the entire logic of the O3CPU’s fetch stage. The first part explains how the fetch stage generates requests to the ITLB and ICache to resolve the virtual-to-physical address translation and to access the cache using the translated address. After the fetch stage receives the instructions from the ICache, the remaining part prepares the data structures that will be passed to the next stage, decode. Let’s first take a look at how the fetch function retrieves the instructions.
First part of the fetch: ITLB to ICache access.
getFetchingThread: selecting thread to let it fetch
If multiple threads need to fetch their next instructions, the processor should select one among them to continue fetching. Based on the policy adopted by the processor, a different thread can be returned depending on the current status of the threads.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
1445 ///////////////////////////////////////
1446 // //
1447 // SMT FETCH POLICY MAINTAINED HERE //
1448 // //
1449 ///////////////////////////////////////
1450 template<class Impl>
1451 ThreadID
1452 DefaultFetch<Impl>::getFetchingThread()
1453 {
1454 if (numThreads > 1) {
1455 switch (fetchPolicy) {
1456 case FetchPolicy::RoundRobin:
1457 return roundRobin();
1458 case FetchPolicy::IQCount:
1459 return iqCount();
1460 case FetchPolicy::LSQCount:
1461 return lsqCount();
1462 case FetchPolicy::Branch:
1463 return branchCount();
1464 default:
1465 return InvalidThreadID;
1466 }
1467 } else {
1468 list<ThreadID>::iterator thread = activeThreads->begin();
1469 if (thread == activeThreads->end()) {
1470 return InvalidThreadID;
1471 }
1472
1473 ThreadID tid = *thread;
1474
1475 if (fetchStatus[tid] == Running ||
1476 fetchStatus[tid] == IcacheAccessComplete ||
1477 fetchStatus[tid] == Idle) {
1478 return tid;
1479 } else {
1480 return InvalidThreadID;
1481 }
1482 }
1483 }
Translating virtual to physical address using I-TLB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
602 template <class Impl>
603 bool
604 DefaultFetch<Impl>::fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc)
605 {
606 Fault fault = NoFault;
607
608 assert(!cpu->switchedOut());
609
610 // @todo: not sure if these should block translation.
611 //AlphaDep
612 if (cacheBlocked) {
613 DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, cache blocked\n",
614 tid);
615 return false;
616 } else if (checkInterrupt(pc) && !delayedCommit[tid]) {
617 // Hold off fetch from getting new instructions when:
618 // Cache is blocked, or
619 // while an interrupt is pending and we're not in PAL mode, or
620 // fetch is switched out.
621 DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, interrupt pending\n",
622 tid);
623 return false;
624 }
625
626 // Align the fetch address to the start of a fetch buffer segment.
627 Addr fetchBufferBlockPC = fetchBufferAlignPC(vaddr);
628
629 DPRINTF(Fetch, "[tid:%i] Fetching cache line %#x for addr %#x\n",
630 tid, fetchBufferBlockPC, vaddr);
631
632 // Setup the memReq to do a read of the first instruction's address.
633 // Set the appropriate read size and flags as well.
634 // Build request here.
635 RequestPtr mem_req = std::make_shared<Request>(
636 tid, fetchBufferBlockPC, fetchBufferSize,
637 Request::INST_FETCH, cpu->instMasterId(), pc,
638 cpu->thread[tid]->contextId());
639
640 mem_req->taskId(cpu->taskId());
641
642 memReq[tid] = mem_req;
643
644 // Initiate translation of the icache block
645 fetchStatus[tid] = ItlbWait;
646 FetchTranslation *trans = new FetchTranslation(this);
647 cpu->itb->translateTiming(mem_req, cpu->thread[tid]->getTC(),
648 trans, BaseTLB::Execute);
649 return true;
650 }
One might ask how the fetch stage knows when the translation is finished. Note that a FetchTranslation object is instantiated and sent to the instruction TLB (itb); it carries the functions that should be invoked after the translation is resolved. Therefore, when the instruction TLB finishes the translation, it invokes the function provided by the passed FetchTranslation object and lets the fetch stage proceed to the next step, initiating the cache access. Anyway, let’s take a look at which function is provided to the TLB.
gem5/src/cpu/o3/fetch.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
115 class FetchTranslation : public BaseTLB::Translation
116 {
117 protected:
118 DefaultFetch<Impl> *fetch;
119
120 public:
121 FetchTranslation(DefaultFetch<Impl> *_fetch)
122 : fetch(_fetch)
123 {}
124
125 void
126 markDelayed()
127 {}
128
129 void
130 finish(const Fault &fault, const RequestPtr &req, ThreadContext *tc,
131 BaseTLB::Mode mode)
132 {
133 assert(mode == BaseTLB::Execute);
134 fetch->finishTranslation(fault, req);
135 delete this;
136 }
137 };
You might remember that the TLB invokes the finish function at the end of the translation. Yes, the FetchTranslation object provides the finish function. When the TLB finishes the translation, by invoking the finish function, it lets the processor know the translation is resolved. The finish function further invokes the finishTranslation function defined in the DefaultFetch class.
finishTranslation: finishing TLB access and generating the cache access
After the request to the TLB has been resolved, the remaining job is accessing the cache to read the instructions to fetch. Let’s take a look at how the fetch stage of the O3 CPU accesses the instruction cache.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
652 template <class Impl>
653 void
654 DefaultFetch<Impl>::finishTranslation(const Fault &fault,
655 const RequestPtr &mem_req)
656 {
657 ThreadID tid = cpu->contextToThread(mem_req->contextId());
658 Addr fetchBufferBlockPC = mem_req->getVaddr();
659
660 assert(!cpu->switchedOut());
661
662 // Wake up CPU if it was idle
663 cpu->wakeCPU();
664
665 if (fetchStatus[tid] != ItlbWait || mem_req != memReq[tid] ||
666 mem_req->getVaddr() != memReq[tid]->getVaddr()) {
667 DPRINTF(Fetch, "[tid:%i] Ignoring itlb completed after squash... fetchStatus:%d\n",
668 tid,fetchStatus[tid]);
669 ++fetchTlbSquashes;
670 return;
671 }
Compared to a simple processor that does not provide speculative execution, the O3 processor utilizes branch prediction and out-of-order execution. Therefore, if the current TLB completion is delivered to the O3CPU because of a misspeculation, it should drop the TLB response and stop accessing the cache. Note that the speculation can turn out to be false while the fetch stage waits for the TLB response. Lines 665-670 check for this misspeculation.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
674 // If translation was successful, attempt to read the icache block.
675 if (fault == NoFault) {
676 // Check that we're not going off into random memory
677 // If we have, just wait around for commit to squash something and put
678 // us on the right track
679 if (!cpu->system->isMemAddr(mem_req->getPaddr())) {
680 warn("Address %#x is outside of physical memory, stopping fetch\n",
681 mem_req->getPaddr());
682 fetchStatus[tid] = NoGoodAddr;
683 memReq[tid] = NULL;
684 return;
685 }
686
687 // Build packet here to access the Icache.
688 PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq);
689 data_pkt->dataDynamic(new uint8_t[fetchBufferSize]);
690
691 fetchBufferPC[tid] = fetchBufferBlockPC;
692 fetchBufferValid[tid] = false;
693 DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
694
695 fetchedCacheLines++;
696
697 // Access the cache.
698 if (!icachePort.sendTimingReq(data_pkt)) {
699 assert(retryPkt == NULL);
700 assert(retryTid == InvalidThreadID);
701 DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
702
703 fetchStatus[tid] = IcacheWaitRetry;
704 retryPkt = data_pkt;
705 retryTid = tid;
706 cacheBlocked = true;
707 } else {
708 DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid);
709 DPRINTF(Activity, "[tid:%i] Activity: Waiting on I-cache "
710 "response.\n", tid);
711 lastIcacheStall[tid] = curTick();
712 fetchStatus[tid] = IcacheWaitResponse;
713 // Notify Fetch Request probe when a packet containing a fetch
714 // request is successfully sent
715 ppFetchRequestSent->notify(mem_req);
716 }
717 } else {
If the current TLB response is valid and was speculated correctly, the fetch stage should generate a read request packet and send it to the instruction cache. Lines 687-695 build the packet and allocate the buffer used to hold the instructions read from the cache. When the cache access request cannot be sent to the instruction cache (lines 698-707) because the cache is busy handling previous requests, it should retry later when the instruction cache becomes available. Based on line 701, we can guess that the cache supports multiple simultaneous cache accesses, but requests can still exceed the capacity of its simultaneous processing. We will see whether GEM5 supports blocking or non-blocking cache accesses in another posting. Anyway, when a retry is required, it memorizes the request packet and the tid, and it changes the current status to IcacheWaitRetry. When the instruction cache is available to process the request (lines 708-716), it sets the current status to IcacheWaitResponse and waits until the instruction cache resolves the request and sends back the actual instructions.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
717 } else {
718 // Don't send an instruction to decode if we can't handle it.
719 if (!(numInst < fetchWidth) || !(fetchQueue[tid].size() < fetchQueueSize)) {
720 assert(!finishTranslationEvent.scheduled());
721 finishTranslationEvent.setFault(fault);
722 finishTranslationEvent.setReq(mem_req);
723 cpu->schedule(finishTranslationEvent,
724 cpu->clockEdge(Cycles(1)));
725 return;
726 }
727 DPRINTF(Fetch, "[tid:%i] Got back req with addr %#x but expected %#x\n",
728 tid, mem_req->getVaddr(), memReq[tid]->getVaddr());
729 // Translation faulted, icache request won't be sent.
730 memReq[tid] = NULL;
731
732 // Send the fault to commit. This thread will not do anything
733 // until commit handles the fault. The only other way it can
734 // wake up is if a squash comes along and changes the PC.
735 TheISA::PCState fetchPC = pc[tid];
736
737 DPRINTF(Fetch, "[tid:%i] Translation faulted, building noop.\n", tid);
738 // We will use a nop in ordier to carry the fault.
739 DynInstPtr instruction = buildInst(tid, StaticInst::nopStaticInstPtr,
740 NULL, fetchPC, fetchPC, false);
741 instruction->setNotAnInst();
742
743 instruction->setPredTarg(fetchPC);
744 instruction->fault = fault;
745 wroteToTimeBuffer = true;
746
747 DPRINTF(Activity, "Activity this cycle.\n");
748 cpu->activityThisCycle();
749
750 fetchStatus[tid] = TrapPending;
751
752 DPRINTF(Fetch, "[tid:%i] Blocked, need to handle the trap.\n", tid);
753 DPRINTF(Fetch, "[tid:%i] fault (%s) detected @ PC %s.\n",
754 tid, fault->name(), pc[tid]);
755 }
756 _status = updateFetchStatus();
757 }
When the TLB translation emits a fault instead of a successful translation, it should be handled based on the reason for the fault. When the fetch stage cannot handle the faulting instruction right away — because the number of instructions fetched this cycle has already reached fetchWidth, or the fetchQueue is already full (lines 719-726) — instead of issuing the cache access, it postpones the operation by scheduling the finishTranslationEvent one cycle later. Note that the request packet received from the ITLB and the fault structure are also stored in the finishTranslationEvent so it can be processed later.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
140 /* Event to delay delivery of a fetch translation result in case of
141 * a fault and the nop to carry the fault cannot be generated
142 * immediately */
143 class FinishTranslationEvent : public Event
144 {
145 private:
146 DefaultFetch<Impl> *fetch;
147 Fault fault;
148 RequestPtr req;
149
150 public:
151 FinishTranslationEvent(DefaultFetch<Impl> *_fetch)
152 : fetch(_fetch), req(nullptr)
153 {}
154
155 void setFault(Fault _fault)
156 {
157 fault = _fault;
158 }
159
160 void setReq(const RequestPtr &_req)
161 {
162 req = _req;
163 }
164
165 /** Process the delayed finish translation */
166 void process()
167 {
168 assert(fetch->numInst < fetch->fetchWidth);
169 fetch->finishTranslation(fault, req);
170 }
171
172 const char *description() const
173 {
174 return "FullO3CPU FetchFinishTranslation";
175 }
176 };
In detail, when the FinishTranslationEvent fires after the designated number of cycles has passed, it invokes the process function defined in the class. As shown in the above code (lines 166-170), it calls finishTranslation again with the stored fault and request.
Otherwise — when the fault can be handled immediately — the fetch stage builds a nop instruction that carries the fault (lines 737-744) and marks activity so the fault travels toward commit; the thread’s status is set to TrapPending, and the thread will not do anything until commit handles the fault or a squash comes along and changes the PC. After the fetch stage handles the response from the ITLB, it should update the current status of the fetch stage by invoking the updateFetchStatus function.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
841 template<class Impl>
842 typename DefaultFetch<Impl>::FetchStatus
843 DefaultFetch<Impl>::updateFetchStatus()
844 {
845 //Check Running
846 list<ThreadID>::iterator threads = activeThreads->begin();
847 list<ThreadID>::iterator end = activeThreads->end();
848
849 while (threads != end) {
850 ThreadID tid = *threads++;
851
852 if (fetchStatus[tid] == Running ||
853 fetchStatus[tid] == Squashing ||
854 fetchStatus[tid] == IcacheAccessComplete) {
855
856 if (_status == Inactive) {
857 DPRINTF(Activity, "[tid:%i] Activating stage.\n",tid);
858
859 if (fetchStatus[tid] == IcacheAccessComplete) {
860 DPRINTF(Activity, "[tid:%i] Activating fetch due to cache"
861 "completion\n",tid);
862 }
863
864 cpu->activateStage(O3CPU::FetchIdx);
865 }
866
867 return Active;
868 }
869 }
870
871 // Stage is switching from active to inactive, notify CPU of it.
872 if (_status == Active) {
873 DPRINTF(Activity, "Deactivating stage.\n");
874
875 cpu->deactivateStage(O3CPU::FetchIdx);
876 }
877
878 return Inactive;
879 }
processCacheCompletion: completing ICache access
When sendTimingReq is invoked through the icachePort — meaning the cache access request was sent to the instruction cache successfully — then after a few cycles have elapsed, the O3CPU will be notified that the cache read has completed. The cache access completion is handled by recvTimingResp of the IcachePort allocated for the O3CPU.
1
2
3
4
5
6
7
8
9
10
11
12
1676 template<class Impl>
1677 bool
1678 DefaultFetch<Impl>::IcachePort::recvTimingResp(PacketPtr pkt)
1679 {
1680 DPRINTF(O3CPU, "Fetch unit received timing\n");
1681 // We shouldn't ever get a cacheable block in Modified state
1682 assert(pkt->req->isUncacheable() ||
1683 !(pkt->cacheResponding() && !pkt->hasSharers()));
1684 fetch->processCacheCompletion(pkt);
1685
1686 return true;
1687 }
When it receives the instructions from the cache, it invokes the processCacheCompletion function and asks this function to handle the response that arrived from the cache.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
389 DefaultFetch<Impl>::processCacheCompletion(PacketPtr pkt)
390 {
391 ThreadID tid = cpu->contextToThread(pkt->req->contextId());
392
393 DPRINTF(Fetch, "[tid:%i] Waking up from cache miss.\n", tid);
394 assert(!cpu->switchedOut());
395
396 // Only change the status if it's still waiting on the icache access
397 // to return.
398 if (fetchStatus[tid] != IcacheWaitResponse ||
399 pkt->req != memReq[tid]) {
400 ++fetchIcacheSquashes;
401 delete pkt;
402 return;
403 }
404
405 memcpy(fetchBuffer[tid], pkt->getConstPtr<uint8_t>(), fetchBufferSize);
406 fetchBufferValid[tid] = true;
407
408 // Wake up the CPU (if it went to sleep and was waiting on
409 // this completion event).
410 cpu->wakeCPU();
411
412 DPRINTF(Activity, "[tid:%i] Activating fetch due to cache completion\n",
413 tid);
414
415 switchToActive();
416
417 // Only switch to IcacheAccessComplete if we're not stalled as well.
418 if (checkStall(tid)) {
419 fetchStatus[tid] = Blocked;
420 } else {
421 fetchStatus[tid] = IcacheAccessComplete;
422 }
423
424 pkt->req->setAccessLatency();
425 cpu->ppInstAccessComplete->notify(pkt);
426 // Reset the mem req to NULL.
427 delete pkt;
428 memReq[tid] = NULL;
429 }
When the instructions from the cache arrive, it could be the case that a misspeculated path initiated the cache access. In that case, the response should be dropped by deleting the response packet. In other cases, the read instructions should be copied from the packet into the fetchBuffer that holds the fetched instructions (lines 405-406). When the current tid is stalled because of some event (we will cover which conditions make a thread stall), it should be Blocked until the stall is resolved. If there is no stall, then the fetchStatus can be changed to IcacheAccessComplete, which means the thread can finish the fetch stage. Now let’s go back to the fetch function again!
Revisiting fetch stage to handle the instructions fetched from the cache
Fetch tick happens every processor tick
One important thing to note is that the fetch stage is executed at every clock cycle. However, depending on the current status of the processor and of other components such as the TLB and cache, the fetch stage may not be able to make meaningful progress and must wait until those components finish their operations. Even when multiple hardware threads exist, if all of them are waiting on cache accesses, none of them can execute the fetch stage. The getFetchingThread function checks the status of all hardware threads and returns a thread if there is one that can execute the fetch stage.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
1156 template<class Impl>
1157 void
1158 DefaultFetch<Impl>::fetch(bool &status_change)
1159 {
1160 //////////////////////////////////////////
1161 // Start actual fetch
1162 //////////////////////////////////////////
1163 ThreadID tid = getFetchingThread();
1164
1165 assert(!cpu->switchedOut());
1166
1167 if (tid == InvalidThreadID) {
1168 // Breaks looping condition in tick()
1169 threadFetched = numFetchingThreads;
1170
1171 if (numThreads == 1) { // @todo Per-thread stats
1172 profileStall(0);
1173 }
1174
1175 return;
1176 }
As shown in the above code, when there is no available hardware thread to execute the fetch stage, getFetchingThread returns InvalidThreadID, and no thread can make progress in that clock cycle. getFetchingThread returns an available thread only when that thread is in one of three fetchStatus states: Running, IcacheAccessComplete, or Idle.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
1000: system.cpu.fetch: Running stage.
1000: system.cpu.fetch: Attempting to fetch from [tid:0]
1000: system.cpu.fetch: [tid:0] Attempting to translate and read instruction, starting at PC (0x7ffff8000090=>0x7ffff8000098).(0=>1).
1000: system.cpu.fetch: [tid:0] Fetching cache line 0x7ffff8000080 for addr 0x7ffff8000090
1000: system.cpu.fetch: Fetch: Doing instruction read.
1000: system.cpu.fetch: [tid:0] Doing Icache access.
1500: system.cpu.fetch: Running stage.
1500: system.cpu.fetch: There are no more threads available to fetch from.
1500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
2000: system.cpu.fetch: Running stage.
2000: system.cpu.fetch: There are no more threads available to fetch from.
2000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
2500: system.cpu.fetch: Running stage.
2500: system.cpu.fetch: There are no more threads available to fetch from.
2500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
3000: system.cpu.fetch: Running stage.
3000: system.cpu.fetch: There are no more threads available to fetch from.
3000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
3500: system.cpu.fetch: Running stage.
3500: system.cpu.fetch: There are no more threads available to fetch from.
3500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
4000: system.cpu.fetch: Running stage.
4000: system.cpu.fetch: There are no more threads available to fetch from.
4000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
4500: system.cpu.fetch: Running stage.
4500: system.cpu.fetch: There are no more threads available to fetch from.
4500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
5000: system.cpu.fetch: Running stage.
5000: system.cpu.fetch: There are no more threads available to fetch from.
5000: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
5500: system.cpu.fetch: Running stage.
5500: system.cpu.fetch: There are no more threads available to fetch from.
5500: system.cpu.fetch: [tid:0] Fetch is waiting cache response!
78000: system.cpu.fetch: [tid:0] Waking up from cache miss.
78001: system.cpu.fetch: [tid:0] Waking up from cache miss.
78500: system.cpu.fetch: Running stage.
78500: system.cpu.fetch: Attempting to fetch from [tid:0]
78500: system.cpu.fetch: [tid:0] Icache miss is complete.
In our current system, because we only have one hardware thread, it cannot execute the fetch stage to make further progress while it waits for the ICache miss to be resolved. This behavior of the fetch stage is shown in the above log. After the thread first fetches the instructions at tick 1000, it cannot make any progress until the ICache miss is resolved at tick 78000. After the ICache miss is resolved (from tick 78500 on), it can finally make progress in the fetch stage. Remember that when an ICache miss is resolved by the processCacheCompletion function, it changes the fetchStatus of the thread from IcacheWaitResponse to IcacheAccessComplete. Therefore, when the fetch stage is executed once again, the previously unexplored path will be executed.
1
2
3
4
5
6
7
8
9
1188 // If returning from the delay of a cache miss, then update the status
1189 // to running, otherwise do the cache access. Possibly move this up
1190 // to tick() function.
1191 if (fetchStatus[tid] == IcacheAccessComplete) {
1192 DPRINTF(Fetch, "[tid:%i] Icache miss is complete.\n", tid);
1193
1194 fetchStatus[tid] = Running;
1195 status_change = true;
1196 } else if (fetchStatus[tid] == Running) {
Compared to the initial fetch execution that initiated the ITLB and ICache accesses, because the fetchStatus has now been changed to IcacheAccessComplete, the fetch stage can execute the rest of the fetch function at this point. Let’s take a look at the rest of the fetch function in detail.
fetchBuffer contains actual instructions for a particular hardware thread
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
1235 //when a requested instruction cache block is arrived(IcacheAccessComplete)
1236 ++fetchCycles;
1237
1238 TheISA::PCState nextPC = thisPC;
1239
1240 StaticInstPtr staticInst = NULL;
1241 StaticInstPtr curMacroop = macroop[tid];
1242
1243 // If the read of the first instruction was successful, then grab the
1244 // instructions from the rest of the cache line and put them into the
1245 // queue heading to decode.
1246
1247 DPRINTF(Fetch, "[tid:%i] Adding instructions to queue to "
1248 "decode.\n", tid);
1249
1250 // Need to keep track of whether or not a predicted branch
1251 // ended this fetch block.
1252 bool predictedBranch = false;
1253
1254 // Need to halt fetch if quiesce instruction detected
1255 bool quiesce = false;
1256
1257 TheISA::MachInst *cacheInsts =
1258 reinterpret_cast<TheISA::MachInst *>(fetchBuffer[tid]);
1259
1260 const unsigned numInsts = fetchBufferSize / instSize;
1261 unsigned blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;
Remember that fetchBuffer[tid] contains the actual instructions read from the ICache. Note that the cacheInsts variable, of type TheISA::MachInst *, references the instruction buffer fetchBuffer[tid]. This variable is used to hand the instruction stream read from the ICache over to the decoder. Also, TheISA::MachInst is a uint64_t in the x86 architecture (TheISA will be changed to the X86 namespace). Because the x86 architecture adopts variable instruction lengths, the code approximates the instruction length as 8 bytes and calculates the number of instruction slots in the instruction stream fetched from the ICache. Note that numInsts is approximated as fetchBufferSize / instSize.
The main fetchloop processing instructions
1
2
3
4
5
6
7
8
9
10
1263 // Loop through instruction memory from the cache.
1264 // Keep issuing while fetchWidth is available and branch is not
1265 // predicted taken
1266 while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize
1267 && !predictedBranch && !quiesce) {
......
1382 // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383 // or not.
1384 inRom = isRomMicroPC(thisPC.microPC());
1385 }
The while loop (lines 1266-1267) is the main body that processes the instructions stored in the fetchBuffer. Be careful not to confuse numInst with numInsts: numInst is the number of instructions fetched in this cycle, while numInsts is the number of instruction slots that can possibly reside in the fetchBuffer. Also, fetchQueue is a C++ standard deque managing DynInstPtr, which is a pointer to one macroop instruction. Therefore, the loop first checks whether the number of instructions fetched in this cycle has reached the designated fetchWidth, and whether the fetchQueue is full, which would mean too many instructions have already been fetched from the instruction cache. Because instruction lengths can vary but the capacity of the fetchQueue is limited, depending on which instructions actually reside in the fetched cache block, it sometimes cannot process all instructions in that cycle. Based on the fact that the loop checks whether the fetchQueue is full at every iteration, we can assume that the loop inserts instructions into the fetchQueue. We will take a look at the details soon! The loop also checks what the previously handled instruction was: if it turned out to be a predicted-taken branch (predictedBranch) or a quiesce instruction, the loop should stop and not process any further instructions in the fetchBuffer.
Decoder
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
1268 // We need to process more memory if we aren't going to get a
1269 // StaticInst from the rom, the current macroop, or what's already
1270 // in the decoder.
1271 bool needMem = !inRom && !curMacroop &&
1272 !decoder[tid]->instReady();
1273 fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1274 Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1275
1276 if (needMem) {
1277 // If buffer is no longer valid or fetchAddr has moved to point
1278 // to the next cache block then start fetch from icache.
1279 if (!fetchBufferValid[tid] ||
1280 fetchBufferBlockPC != fetchBufferPC[tid])
1281 break;
1282
1283 if (blkOffset >= numInsts) {
1284 // We need to process more memory, but we've run out of the
1285 // current block.
1286 break;
1287 }
1288
1289 decoder[tid]->moreBytes(thisPC, fetchAddr, cacheInsts[blkOffset]);
1290
1291 if (decoder[tid]->needMoreBytes()) {
1292 blkOffset++;
1293 fetchAddr += instSize;
1294 pcOffset += instSize;
1295 }
1296 }
After all the conditions are met, each iteration of the loop processes one instruction at a time. For the first execution of the fetch stage, inRom and curMacroop are set to false and NULL, respectively. Also, when the decoder object embedded in the fetch stage is initialized, the instDone variable of the decoder is set to false, which is what the decoder's instReady function returns. Therefore, needMem is set for the initial execution. When the needMem flag is set — meaning the decoder needs more raw instruction bytes before it can produce an instruction — it invokes the moreBytes function of the decoder to feed it the next chunk of the fetched instruction stream.
1
2
3
4
5
6
7
8
9
10
11
306 //Use this to give data to the decoder. This should be used
307 //when there is control flow.
308 void moreBytes(const PCState &pc, Addr fetchPC, MachInst data)
309 {
310 DPRINTF(Decoder, "Getting more bytes.\n");
311 basePC = fetchPC;
312 offset = (fetchPC >= pc.instAddr()) ? 0 : pc.instAddr() - fetchPC;
313 fetchChunk = letoh(data);
314 outOfBytes = false;
315 process();
316 }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
74 Decoder::process()
75 {
76 //This function drives the decoder state machine.
77
78 //Some sanity checks. You shouldn't try to process more bytes if
79 //there aren't any, and you shouldn't overwrite an already
80 //decoder ExtMachInst.
81 assert(!outOfBytes);
82 assert(!instDone);
83
84 if (state == ResetState)
85 state = doResetState();
86 if (state == FromCacheState) {
87 state = doFromCacheState();
88 } else {
89 instBytes->chunks.push_back(fetchChunk);
90 }
91
92 //While there's still something to do...
93 while (!instDone && !outOfBytes) {
94 uint8_t nextByte = getNextByte();
95 switch (state) {
96 case PrefixState:
97 state = doPrefixState(nextByte);
98 break;
99 case Vex2Of2State:
100 state = doVex2Of2State(nextByte);
101 break;
102 case Vex2Of3State:
103 state = doVex2Of3State(nextByte);
104 break;
105 case Vex3Of3State:
106 state = doVex3Of3State(nextByte);
107 break;
108 case VexOpcodeState:
109 state = doVexOpcodeState(nextByte);
110 break;
111 case OneByteOpcodeState:
112 state = doOneByteOpcodeState(nextByte);
113 break;
114 case TwoByteOpcodeState:
115 state = doTwoByteOpcodeState(nextByte);
116 break;
117 case ThreeByte0F38OpcodeState:
118 state = doThreeByte0F38OpcodeState(nextByte);
119 break;
120 case ThreeByte0F3AOpcodeState:
121 state = doThreeByte0F3AOpcodeState(nextByte);
122 break;
123 case ModRMState:
124 state = doModRMState(nextByte);
125 break;
126 case SIBState:
127 state = doSIBState(nextByte);
128 break;
129 case DisplacementState:
130 state = doDisplacementState();
131 break;
132 case ImmediateState:
133 state = doImmediateState();
134 break;
135 case ErrorState:
136 panic("Went to the error state in the decoder.\n");
137 default:
138 panic("Unrecognized state! %d\n", state);
139 }
140 }
141 }
Based on the instruction format, different doXXX functions will be invoked to parse the macroop instruction. First of all, it invokes doResetState for every macroop to initialize the variables representing the parsed instruction. It also sets the origPC field to the PC address of the macroop instruction. After this initialization, different parsing code is invoked depending on the instruction format. From the first n-1 byte(s) of the instruction, the format of the next byte(s) is determined; therefore, by parsing the bytes one by one, instructions of different formats can be fully decoded by the above process function. During the parsing, it invokes the consumeByte(s) functions when a particular part of the instruction has been successfully decoded. The consumeByte function increases the offset variable of the decoder, which represents the length of the macroop currently being parsed. After moreBytes finishes the early decoding of the macroop instruction, it sets instDone to true. However, note that the moreBytes and process functions merely parse the macroop instruction to extract the bytes dedicated to each part of the instruction, such as the REX prefix and the ModRM byte in the x86 architecture. Therefore, we still need to decode the parsed instruction to understand what this instruction actually is!
The second loop to process each instruction
After the decoder finishes the early decode of the macroop instruction, it encounters another loop that translates the macroop instruction into multiple microops if possible. Note that the processor pipeline executes the microops, not the macroop instructions. Therefore, instead of the macroop, the microops should be inserted into the fetch queue.
1
2
3
4
5
6
7
1298 // Extract as many instructions and/or microops as we can from
1299 // the memory we've processed so far.
1300 do {
......
1378 } while ((curMacroop || decoder[tid]->instReady()) &&
1379 numInst < fetchWidth &&
1380 fetchQueue[tid].size() < fetchQueueSize);
As shown in the above code, the second loop continues while either curMacroop is not NULL or the decoder has another instruction ready (instReady), as long as the fetch bandwidth (fetchWidth) has not been exhausted and the fetchQueue still has room to contain the translated microops. Let’s take a look at the details of the second loop.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
1298 // Extract as many instructions and/or microops as we can from
1299 // the memory we've processed so far.
1300 do {
1301 if (!(curMacroop || inRom)) {
1302 if (decoder[tid]->instReady()) {
1303 staticInst = decoder[tid]->decode(thisPC);
1304
1305 // Increment stat of fetched instructions.
1306 ++fetchedInsts;
1307
1308 if (staticInst->isMacroop()) {
1309 curMacroop = staticInst;
1310 } else {
1311 pcOffset = 0;
1312 }
1313 } else {
1314 // We need more bytes for this instruction so blkOffset and
1315 // pcOffset will be updated
1316 break;
1317 }
1318 }
Note that we have not yet assigned anything to curMacroop and are not executing ROM code. Also, decoder[tid]->instReady is true because the moreBytes function successfully pre-decoded the macroop instruction. Therefore, the code invokes the decode function to understand which instruction it actually is. The decode function of the decoder generates a StaticInstPtr which carries information about the instruction located at thisPC. In our case, because we are executing the macroop instruction for the first time, it should return a reference to the macroop instruction. Let’s briefly take a look at the decode function.
gem5/src/arch/x86/decode.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
693 StaticInstPtr
694 Decoder::decode(PCState &nextPC)
695 {
696 if (!instDone)
697 return NULL;
698 instDone = false;
699 updateNPC(nextPC);
700
701 StaticInstPtr &si = instBytes->si;
702 if (si)
703 return si;
704
705 // We didn't match in the AddrMap, but we still populated an entry. Fix
706 // up its byte masks.
707 const int chunkSize = sizeof(MachInst);
708
709 instBytes->lastOffset = offset;
710
711 Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
712 Addr firstOffset = origPC - firstBasePC;
713 Addr totalSize = instBytes->lastOffset - firstOffset +
714 (instBytes->chunks.size() - 1) * chunkSize;
715 int start = firstOffset;
716 instBytes->masks.clear();
717
718 while (totalSize) {
719 int end = start + totalSize;
720 end = (chunkSize < end) ? chunkSize : end;
721 int size = end - start;
722 int idx = instBytes->masks.size();
723
724 MachInst maskVal = mask(size * 8) << (start * 8);
725 assert(maskVal);
726
727 instBytes->masks.push_back(maskVal);
728 instBytes->chunks[idx] &= instBytes->masks[idx];
729 totalSize -= size;
730 start = 0;
731 }
732
733 si = decode(emi, origPC);
734 return si;
735 }
There are two important things to be done by the decode function. First, it invokes updateNPC to update the next pc based on the current instruction. Also remember that the basePC has been set as fetchAddr when the moreBytes has been invoked.
1
2
3
4
5
6
7
8
9
10
11
12
13
328 void
329 updateNPC(X86ISA::PCState &nextPC)
330 {
331 if (!nextPC.size()) {
332 int size = basePC + offset - origPC;
333 DPRINTF(Decoder,
334 "Calculating the instruction size: "
335 "basePC: %#x offset: %#x origPC: %#x size: %d\n",
336 basePC, offset, origPC, size);
337 nextPC.size(size);
338 nextPC.npc(nextPC.pc() + size);
339 }
340 }
Because the decoder already knows the length of the instruction, it can calculate the instruction size and set the nextPC value to current PC + sizeof(instruction). The npc function updates the _npc field of nextPC, and that value will later be used to update the _pc member field of the PCState object. Note that the nextPC parameter here is actually the thisPC variable declared in the fetch function. This can be confusing because the fetch function also declares its own nextPC variable, but updateNPC updates the npc of thisPC, not the fetch function's nextPC. After updating the npc, the decode function invokes the actual (inner) decode function. It is also important that updateNPC only takes effect when curMacroop is NULL; while the microops of a macroop are being fetched, the npc is not updated.
1
2
3
4
5
6
7
8
9
10
11
681 StaticInstPtr
682 Decoder::decode(ExtMachInst mach_inst, Addr addr)
683 {
684 auto iter = instMap->find(mach_inst);
685 if (iter != instMap->end())
686 return iter->second;
687
688 StaticInstPtr si = decodeInst(mach_inst);
689 (*instMap)[mach_inst] = si;
690 return si;
691 }
It searches the decode cache instMap to find a cached instruction object, in case the same instruction was decoded earlier. If not, it invokes the decodeInst function, which is automatically generated by GEM5's Python-based ISA parser. We will not cover the details of the decodeInst function in this posting. Let’s go back to the second loop again! After the decode function returns, we finally have the object associated with the decoded instruction. If the decoded instruction is a macroop, the code sets curMacroop to the returned staticInst.
fetchMicroop: Fetching microops from the macroop or ROM
1
2
3
4
5
6
7
8
9
10
11
12
13
1319 // Whether we're moving to a new macroop because we're at the
1320 // end of the current one, or the branch predictor incorrectly
1321 // thinks we are...
1322 bool newMacro = false;
1323 if (curMacroop || inRom) {
1324 if (inRom) {
1325 staticInst = cpu->microcodeRom.fetchMicroop(
1326 thisPC.microPC(), curMacroop);
1327 } else {
1328 staticInst = curMacroop->fetchMicroop(thisPC.microPC());
1329 }
1330 newMacro |= staticInst->isLastMicroop();
1331 }
curMacroop is set to the macroop instruction pointed to by the PC. However, to execute the instruction on the pipeline, we need access to the microops making up the current macroop. You might remember that a macroop consists of multiple microops. It might also remind you of the ROM code. Indeed, there are two places where microops come from. Therefore, based on the current status of the processor — whether it is executing a macroop or ROM code — it needs to fetch the microops from the relevant place. Regardless of the location, GEM5 utilizes an interface called fetchMicroop. When the processor is in the midst of executing ROM code, it invokes the fetchMicroop function of the microcodeRom.
gem5/src/arch/x86/microcode_rom.hh
1
2
3
4
5
6
7
8
9
60 StaticInstPtr
61 fetchMicroop(MicroPC microPC, StaticInstPtr curMacroop)
62 {
63 microPC = normalMicroPC(microPC);
64 if (microPC >= numMicroops)
65 return X86ISA::badMicroop;
66 else
67 return genFuncs[microPC](curMacroop);
68 }
Also when the processor is in the middle of executing the macroop, it should ask the macroop to return microops consisting of it.
gem5/src/arch/x86/insts/macroop.hh
1
2
3
4
5
6
7
8
77 StaticInstPtr
78 fetchMicroop(MicroPC microPC) const
79 {
80 if (microPC >= numMicroops)
81 return badMicroop;
82 else
83 return microops[microPC];
84 }
gem5/src/cpu/fetch_impl.hh
1
1239 StaticInstPtr staticInst = NULL;
The return value of the fetchMicroop function is stored in staticInst, which is a StaticInstPtr and can therefore point to any instruction. Previously, the decoded macroop was pointed to by this same staticInst variable. The class provides a method to discern whether the instruction is a macroop or a microop.
Populating dynamic instruction object
1
2
3
4
5
6
7
8
9
10
11
12
13
1332
1333 DynInstPtr instruction =
1334 buildInst(tid, staticInst, curMacroop,
1335 thisPC, nextPC, true);
1336
1337 ppFetch->notify(instruction);
1338 numInst++;
1339
1340 #if TRACING_ON
1341 if (DTRACE(O3PipeView)) {
1342 instruction->fetchTick = curTick();
1343 }
1344 #endif
Now we have a macroop pointed to by the curMacroop variable and its associated microop pointed to by staticInst. Using this information, the buildInst function populates the dynamic object representing one instruction that can actually be executed on the pipeline. One might ask why we need yet another object for an instruction. Note, however, that the previous objects are static instruction objects, whereas we need a dynamic instruction object that conveys all the information required for executing the instruction through the pipeline. Dynamic instruction objects are created to pass per-instruction information between different pipeline stages. Therefore, the buildInst function generates the dynamic instruction and enqueues it into the fetch queue to pass the instruction's information to the next pipeline stages. Let’s take a look at how buildInst generates the dynamic instruction.
buildInst: populating microops from the macroop
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
1102 template<class Impl>
1103 typename Impl::DynInstPtr
1104 DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
1105 StaticInstPtr curMacroop, TheISA::PCState thisPC,
1106 TheISA::PCState nextPC, bool trace)
1107 {
1108 // Get a sequence number.
1109 InstSeqNum seq = cpu->getAndIncrementInstSeq();
1110
1111 // Create a new DynInst from the instruction fetched.
1112 DynInstPtr instruction =
1113 new DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu);
1114 instruction->setTid(tid);
1115
1116 instruction->setASID(tid);
1117
1118 instruction->setThreadState(cpu->thread[tid]);
1119
1120 DPRINTF(Fetch, "[tid:%i] Instruction PC %#x (%d) created "
1121 "[sn:%lli].\n", tid, thisPC.instAddr(),
1122 thisPC.microPC(), seq);
1123
1124 DPRINTF(Fetch, "[tid:%i] Instruction is: %s\n", tid,
1125 instruction->staticInst->
1126 disassemble(thisPC.instAddr()));
You can think of the DynInst as the metadata conveying all the information needed to execute one instruction. After the instruction is created, the code sets the thread-specific information of the instruction (tid, ASID). That information is required later in the execution stage to determine which instruction was issued by which thread.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
53 struct O3CPUImpl
54 {
55 /** The type of MachInst. */
56 typedef TheISA::MachInst MachInst;
57
58 /** The CPU policy to be used, which defines all of the CPU stages. */
59 typedef SimpleCPUPolicy<O3CPUImpl> CPUPol;
60
61 /** The DynInst type to be used. */
62 typedef BaseO3DynInst<O3CPUImpl> DynInst;
63
64 /** The refcounted DynInst pointer to be used. In most cases this is
65 * what should be used, and not DynInst *.
66 */
67 typedef RefCountingPtr<DynInst> DynInstPtr;
The constructor call of the DynInst invokes the constructor of the BaseO3DynInst class and initializes its member fields as shown in the following constructor.
gem5/src/cpu/o3/dyn_inst_impl.hh
1
2
3
4
5
6
7
8
9
50 template <class Impl>
51 BaseO3DynInst<Impl>::BaseO3DynInst(const StaticInstPtr &staticInst,
52 const StaticInstPtr ¯oop,
53 TheISA::PCState pc, TheISA::PCState predPC,
54 InstSeqNum seq_num, O3CPU *cpu)
55 : BaseDynInst<Impl>(staticInst, macroop, pc, predPC, seq_num, cpu)
56 {
57 initVars();
58 }
Let’s take a look at who derives the DynInstPtr then.
1
2
3
4
5
6
7
8
97 template <class Impl>
98 class FullO3CPU : public BaseO3CPU
99 {
100 public:
101 // Typedefs from the Impl here.
102 typedef typename Impl::CPUPol CPUPolicy;
103 typedef typename Impl::DynInstPtr DynInstPtr;
104 typedef typename Impl::O3CPU O3CPU;
As shown in the above code,
the DynInstPtr is the Impl::DynInstPtr,
which is the RefCountingPtr
Inserting generated dynamic instructions into the fetchQueue
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
1127
1128 #if TRACING_ON
1129 if (trace) {
1130 instruction->traceData =
1131 cpu->getTracer()->getInstRecord(curTick(), cpu->tcBase(tid),
1132 instruction->staticInst, thisPC, curMacroop);
1133 }
1134 #else
1135 instruction->traceData = NULL;
1136 #endif
1137
1138 // Add instruction to the CPU's list of instructions.
1139 instruction->setInstListIt(cpu->addInst(instruction));
1140
1141 // Write the instruction to the first slot in the queue
1142 // that heads to decode.
1143 assert(numInst < fetchWidth);
1144 fetchQueue[tid].push_back(instruction);
1145 assert(fetchQueue[tid].size() <= fetchQueueSize);
1146 DPRINTF(Fetch, "[tid:%i] Fetch queue entry created (%i/%i).\n",
1147 tid, fetchQueue[tid].size(), fetchQueueSize);
1148 //toDecode->insts[toDecode->size++] = instruction;
1149
1150 // Keep track of if we can take an interrupt at this boundary
1151 delayedCommit[tid] = instruction->isDelayedCommit();
1152
1153 return instruction;
1154 }
After the dynamic instruction is populated, it should be inserted into the fetchQueue to pass the generated instructions to the next stage. Now let’s go back to the second loop of the fetch function
Updating nextPC and handling branch instruction
1
2
3
4
5
6
7
8
9
10
1346 nextPC = thisPC;
1347
1348 // If we're branching after this instruction, quit fetching
1349 // from the same block.
1350 predictedBranch |= thisPC.branching();
1351 predictedBranch |=
1352 lookupAndUpdateNextPC(instruction, nextPC);
1353 if (predictedBranch) {
1354 DPRINTF(Fetch, "Branch detected with PC = %s\n", thisPC);
1355 }
Until now, we have populated the microops and enqueued the generated instructions into the fetchQueue. To repeat this sequence of operations and fill the fetchQueue, the second loop should determine the nextPC to lookup. First of all, if the current instruction is one of the branching instructions, the nextPC should be determined based on the execution result of branch prediction speculatively.
lookupAndUpdateNextPC: determine the nextPC based on control flow instruction
The lookupAndUpdateNextPC determines the nextPC by checking whether the current instruction is the control flow instruction. Also, because O3 processor adopts branch predictor, the lookupAndUpdateNextPC asks branch predictor whether it needs to change the nextPC if the current instruction is the branching instruction. Note that the lookupAndUpdateNextPC accepts the dynamic instruction we generated in the buildInst function.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
556 template <class Impl>
557 bool
558 DefaultFetch<Impl>::lookupAndUpdateNextPC(
559 const DynInstPtr &inst, TheISA::PCState &nextPC)
560 {
561 // Do branch prediction check here.
562 // A bit of a misnomer...next_PC is actually the current PC until
563 // this function updates it.
564 bool predict_taken;
565
566 if (!inst->isControl()) {
567 TheISA::advancePC(nextPC, inst->staticInst);
568 inst->setPredTarg(nextPC);
569 inst->setPredTaken(false);
570 return false;
571 }
First of all, it can simply check whether the current instruction affects control flow by invoking the isControl method of the dynamic instruction. The isControl function of the dynamic instruction just invokes the same method on the staticInst of the DynInst, the static object representing the microop operation. If the current instruction is not a control flow instruction, it just updates nextPC by invoking the advancePC function with the staticInst of the current dynamic instruction (note that PC tracking is done at the macroop level).
advancePC: advance micro pc or pc based on the architecture
gem5/src/arch/x86/utility.hh
1
2
3
4
5
78 inline void
79 advancePC(PCState &pc, const StaticInstPtr &inst)
80 {
81 inst->advancePC(pc);
82 }
The advancePC function invokes advancePC function of the StaticInstPtr class back to back. Because we are targeting X86 architecture, the inst should be the object of the X86StaticInst class.
gem5/src/arch/x86/insts/static_inst.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
77 /**
78 * Base class for all X86 static instructions.
79 */
80
81 class X86StaticInst : public StaticInst
82 {
83 protected:
84 // Constructor.
85 X86StaticInst(const char *mnem,
86 ExtMachInst _machInst, OpClass __opClass)
87 : StaticInst(mnem, _machInst, __opClass)
88 {
89 }
......
179 void
180 advancePC(PCState &pcState) const
181 {
182 pcState.advance();
183 }
184 };
Also, remind that X86 architecture executes the microop instead of the macroop. Therefore, the StaticInstPtr points to microop object in x86. Thus X86 on GEM5 provide another class called X86MicroopBase inheriting X86StaticInst class.
gem5/src/arch/x86/insts/microop.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
88 //A class which is the base of all x86 micro ops. It provides a function to
89 //set necessary flags appropriately.
90 class X86MicroopBase : public X86StaticInst
91 {
92 protected:
93 const char * instMnem;
94 uint8_t opSize;
95 uint8_t addrSize;
96
97 X86MicroopBase(ExtMachInst _machInst,
98 const char *mnem, const char *_instMnem,
99 uint64_t setFlags, OpClass __opClass) :
100 X86ISA::X86StaticInst(mnem, _machInst, __opClass),
101 instMnem(_instMnem)
102 {
103 const int ChunkSize = sizeof(unsigned long);
104 const int Chunks = sizeof(setFlags) / ChunkSize;
105
106 // Since the bitset constructor can only handle unsigned long
107 // sized chunks, feed it those one at a time while oring them in.
108 for (int i = 0; i < Chunks; i++) {
109 unsigned shift = i * ChunkSize * 8;
110 flags |= (std::bitset<Num_Flags>(setFlags >> shift) << shift);
111 }
112 }
113
114 std::string generateDisassembly(Addr pc,
115 const SymbolTable *symtab) const
116 {
117 std::stringstream ss;
118
119 ccprintf(ss, "\t%s.%s", instMnem, mnemonic);
120
121 return ss.str();
122 }
123
124 bool checkCondition(uint64_t flags, int condition) const;
125
126 void
127 advancePC(PCState &pcState) const
128 {
129 if (flags[IsLastMicroop])
130 pcState.uEnd();
131 else
132 pcState.uAdvance();
133 }
134 };
Based on whether it is the last microop, it invokes different function of the PCState, UEnd and uAdvance respectively. Here the pcState object is the architecture specific PCState object defined as below.
PCState class
gem5/src/arch/x86/types.hh
1
2
3
4
5
6
7
8
9
10
11
289 class PCState : public GenericISA::UPCState<MachInst>
290 {
291 protected:
292 typedef GenericISA::UPCState<MachInst> Base;
......
324 void
325 advance()
326 {
327 Base::advance();
328 _size = 0;
329 }
Because the PCState doesn’t implement the uEnd and uAdvance function,
we should take a look at its parent class,
GenericISA::UPCState
gem5/src/arch/generic/types.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
193 // A PC and microcode PC.
194 template <class MachInst>
195 class UPCState : public SimplePCState<MachInst>
196 {
197 protected:
198 typedef SimplePCState<MachInst> Base;
199
200 MicroPC _upc;
201 MicroPC _nupc;
202
203 public:
204
205 MicroPC upc() const { return _upc; }
206 void upc(MicroPC val) { _upc = val; }
207
208 MicroPC nupc() const { return _nupc; }
209 void nupc(MicroPC val) { _nupc = val; }
......
228 bool
229 branching() const
230 {
231 return this->npc() != this->pc() + sizeof(MachInst) ||
232 this->nupc() != this->upc() + 1;
233 }
234
235 // Advance the upc within the instruction.
236 void
237 uAdvance()
238 {
239 _upc = _nupc;
240 _nupc++;
241 }
242
243 // End the macroop by resetting the upc and advancing the regular pc.
244 void
245 uEnd()
246 {
247 this->advance();
248 _upc = 0;
249 _nupc = 1;
250 }
When the uAdvance function is invoked, it just updates the _upc member field representing the micro PC of the current hardware thread. However, when uEnd is invoked, it should update the PC instead of the micro PC (upc). Because UPCState doesn’t implement the PC-related member fields and functions, it invokes the advance function of its parent, SimplePCState. Note that the pc points at the macroop instruction, while the upc is the instruction pointer among the microops comprising that one macroop.
gem5/src/arch/generic/types.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
139 // The most basic type of PC.
140 template <class MachInst>
141 class SimplePCState : public PCStateBase
142 {
143 protected:
144 typedef PCStateBase Base;
145
146 public:
147
148 Addr pc() const { return _pc; }
149 void pc(Addr val) { _pc = val; }
150
151 Addr npc() const { return _npc; }
152 void npc(Addr val) { _npc = val; }
153
154 void
155 set(Addr val)
156 {
157 pc(val);
158 npc(val + sizeof(MachInst));
159 };
160
161 void
162 setNPC(Addr val)
163 {
164 npc(val);
165 }
166
167 SimplePCState() {}
168 SimplePCState(Addr val) { set(val); }
169
170 bool
171 branching() const
172 {
173 return this->npc() != this->pc() + sizeof(MachInst);
174 }
175
176 // Advance the PC.
177 void
178 advance()
179 {
180 _pc = _npc;
181 _npc += sizeof(MachInst);
182 }
183 };
It just updates _pc to _npc, which was previously computed by adding the size of the macroop instruction to the current PC. In other words, if it is not a control flow instruction, simply adding the size of the current instruction to the PC is enough to obtain the next PC address.
Asking branch predictor for a control flow instruction
Now let’s go back to the rest of the lookupAndUpdateNextPC function to understand what happens if the current instruction turns out to be control flow instruction.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
572
573 ThreadID tid = inst->threadNumber;
574 predict_taken = branchPred->predict(inst->staticInst, inst->seqNum,
575 nextPC, tid);
576
577 if (predict_taken) {
578 DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
579 "predicted to be taken to %s\n",
580 tid, inst->seqNum, inst->pcState().instAddr(), nextPC);
581 } else {
582 DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
583 "predicted to be not taken\n",
584 tid, inst->seqNum, inst->pcState().instAddr());
585 }
586
587 DPRINTF(Fetch, "[tid:%i] [sn:%llu] Branch at PC %#x "
588 "predicted to go to %s\n",
589 tid, inst->seqNum, inst->pcState().instAddr(), nextPC);
590 inst->setPredTarg(nextPC);
591 inst->setPredTaken(predict_taken);
592
593 ++fetchedBranches;
594
595 if (predict_taken) {
596 ++predictedBranches;
597 }
598
599 return predict_taken;
600 }
It invokes the predict function and stores the return value in predict_taken. The predict function returns the prediction result: whether the branching instruction should be taken or not taken (when the instruction is not a control flow instruction, it returns not-taken so that the following instructions are executed sequentially). Also, note that a reference to nextPC is passed to the branch predictor. This is because the prediction affects the next instruction's address. Therefore, based on the prediction result, the predictor changes nextPC so that the fetch stage fetches instructions from the proper location.
End of the second loop
1
2
3
1356
1357 newMacro |= thisPC.instAddr() != nextPC.instAddr();
1358
Remember that we are currently executing the second loop to translate curMacroop into microops. However, when one of its microops turns out to be a control flow instruction that is predicted to be taken, the PC has to change. For that purpose, the code compares the PC addresses of thisPC and nextPC. Previously, before invoking the lookupAndUpdateNextPC function, it assigned thisPC to nextPC (line 1346). However, when the prediction is taken, the PC address of nextPC will be changed to the target of the taken branch. Therefore, by comparing the PC addresses of nextPC and thisPC, we can tell whether we are facing another macroop or still executing the microops of the current macroop (line 1357).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
53 // The guaranteed interface.
54 class PCStateBase : public Serializable
55 {
56 protected:
57 Addr _pc;
58 Addr _npc;
59
60 PCStateBase() : _pc(0), _npc(0) {}
61 PCStateBase(Addr val) : _pc(0), _npc(0) { set(val); }
62
63 public:
64 /**
65 * Returns the memory address the bytes of this instruction came from.
66 *
67 * @return Memory address of the current instruction's encoding.
68 */
69 Addr
70 instAddr() const
71 {
72 return _pc;
73 }
74
75 /**
76 * Returns the memory address the bytes of the next instruction came from.
77 *
78 * @return Memory address of the next instruction's encoding.
79 */
80 Addr
81 nextInstAddr() const
82 {
83 return _npc;
84 }
85
86 /**
87 * Returns the current micropc.
88 *
89 * @return The current micropc.
90 */
91 MicroPC
92 microPC() const
93 {
94 return 0;
95 }
After the newMacro flag has been set, it assigns nextPC to thisPC. One might think that nextPC will equal thisPC when the branch prediction is not taken, but lookupAndUpdateNextPC still advances the (micro) PC by invoking the advancePC function when the instruction is not a control flow instruction or is predicted as not taken.
```cpp
1359 // Move to the next instruction, unless we have a branch.
1360 thisPC = nextPC;
1361 inRom = isRomMicroPC(thisPC.microPC());
1362
1363 if (newMacro) {
1364 fetchAddr = thisPC.instAddr() & BaseCPU::PCMask;
1365 blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize;
1366 pcOffset = 0;
1367 curMacroop = NULL;
1368 }
1369
1370 if (instruction->isQuiesce()) {
1371 DPRINTF(Fetch,
1372 "Quiesce instruction encountered, halting fetch!\n");
1373 fetchStatus[tid] = QuiescePending;
1374 status_change = true;
1375 quiesce = true;
1376 break;
1377 }
1378 } while ((curMacroop || decoder[tid]->instReady()) &&
1379 numInst < fetchWidth &&
1380 fetchQueue[tid].size() < fetchQueueSize);
1381
1382 // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383 // or not.
If the newMacro flag is set to true, then the code updates the addresses required to fetch the next instruction and sets curMacroop to NULL. Therefore, when a new macroop is found, the second loop exits and execution continues with the first loop.
End of the first loop and rest
1
2
3
4
5
6
7
8
9
10
1263 // Loop through instruction memory from the cache.
1264 // Keep issuing while fetchWidth is available and branch is not
1265 // predicted taken
1266 while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize
1267 && !predictedBranch && !quiesce) {
......
1382 // Re-evaluate whether the next instruction to fetch is in micro-op ROM
1383 // or not.
1384 inRom = isRomMicroPC(thisPC.microPC());
1385 }
After translating the macroop into microops by executing the second loop, execution continues in the first loop. As we checked before, as long as the number of fetched instructions does not exceed the fetchWidth (bandwidth), the fetchQueue does not overflow, and no branch is predicted taken, all the logic that we have examined so far will be repeated. Then what should be done when the first loop exits?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
1386
1387 if (predictedBranch) {
1388 DPRINTF(Fetch, "[tid:%i] Done fetching, predicted branch "
1389 "instruction encountered.\n", tid);
1390 } else if (numInst >= fetchWidth) {
1391 DPRINTF(Fetch, "[tid:%i] Done fetching, reached fetch bandwidth "
1392 "for this cycle.\n", tid);
1393 } else if (blkOffset >= fetchBufferSize) {
1394 DPRINTF(Fetch, "[tid:%i] Done fetching, reached the end of the"
1395 "fetch buffer.\n", tid);
1396 }
1397
1398 macroop[tid] = curMacroop;
1399 fetchOffset[tid] = pcOffset;
First, it prints out a debugging message based on the exit condition of the first loop. It then updates the macroop of the current hardware thread with curMacroop, and the fetchOffset is updated with pcOffset.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
1400
1401 if (numInst > 0) {
1402 wroteToTimeBuffer = true;
1403 }
1404
1405 pc[tid] = thisPC;
1406
1407 // pipeline a fetch if we're crossing a fetch buffer boundary and not in
1408 // a state that would preclude fetching
1409 fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask;
1410 Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr);
1411 issuePipelinedIfetch[tid] = fetchBufferBlockPC != fetchBufferPC[tid] &&
1412 fetchStatus[tid] != IcacheWaitResponse &&
1413 fetchStatus[tid] != ItlbWait &&
1414 fetchStatus[tid] != IcacheWaitRetry &&
1415 fetchStatus[tid] != QuiescePending &&
1416 !curMacroop;
1417 }
Rest of the tick function of the fetch stage.
Issuing the Icache access for a split access? (TODO)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
936 // Record number of instructions fetched this cycle for distribution.
937 fetchNisnDist.sample(numInst);
938
939 if (status_change) {
940 // Change the fetch stage status if there was a status change.
941 _status = updateFetchStatus();
942 }
943
944 // Issue the next I-cache request if possible.
945 for (ThreadID i = 0; i < numThreads; ++i) {
946 if (issuePipelinedIfetch[i]) {
947 pipelineIcacheAccesses(i);
948 }
949 }
950
951 // Send instructions enqueued into the fetch queue to decode.
952 // Limit rate by fetchWidth. Stall if decode is stalled.
953 unsigned insts_to_decode = 0;
954 unsigned available_insts = 0;
955
956 for (auto tid : *activeThreads) {
957 if (!stalls[tid].decode) {
958 available_insts += fetchQueue[tid].size();
959 }
960 }
Sending fetched instructions to decode stage
gem5/src/cpu/o3/fetch_impl.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
961
962 // Pick a random thread to start trying to grab instructions from
963 auto tid_itr = activeThreads->begin();
964 std::advance(tid_itr, random_mt.random<uint8_t>(0, activeThreads->size() - 1));
965
966 while (available_insts != 0 && insts_to_decode < decodeWidth) {
967 ThreadID tid = *tid_itr;
968 if (!stalls[tid].decode && !fetchQueue[tid].empty()) {
969 const auto& inst = fetchQueue[tid].front();
970 toDecode->insts[toDecode->size++] = inst;
971 DPRINTF(Fetch, "[tid:%i] [sn:%llu] Sending instruction to decode "
972 "from fetch queue. Fetch queue size: %i.\n",
973 tid, inst->seqNum, fetchQueue[tid].size());
974
975 wroteToTimeBuffer = true;
976 fetchQueue[tid].pop_front();
977 insts_to_decode++;
978 available_insts--;
979 }
980
981 tid_itr++;
982 // Wrap around if at end of active threads list
983 if (tid_itr == activeThreads->end())
984 tid_itr = activeThreads->begin();
985 }
986
987 // If there was activity this cycle, inform the CPU of it.
988 if (wroteToTimeBuffer) {
989 DPRINTF(Activity, "Activity this cycle.\n");
990 cpu->activityThisCycle();
991 }
992
993 // Reset the number of the instruction we've fetched.
994 numInst = 0;
995 } //end of the fetch.tick
The last job of the fetch stage is passing the fetched instructions to the next stage, the decode stage. In the above code, the toDecode member field of the fetch stage is used as storage located between the fetch and decode stages.
FetchStruct: passing fetch stage’s information to decode stage
gem5/src/cpu/o3/fetch.hh
1
2
3
4
5
6
7
8
9
10
11
12
431 //Might be annoying how this name is different than the queue.
432 /** Wire used to write any information heading to decode. */
433 typename TimeBuffer<FetchStruct>::wire toDecode;
......
458 /** Source of possible stalls. */
459 struct Stalls {
460 bool decode;
461 bool drain;
462 };
463
464 /** Tracks which stages are telling fetch to stall. */
465 Stalls stalls[Impl::MaxThreads];
The toDecode is declared as the wire class defined inside the TimeBuffer class. Also, because TimeBuffer is a template class, it is instantiated with the FetchStruct that contains all of the fetch stage's information required by the decode stage. Let's take a look at the FetchStruct to understand which information is passed to the decode stage.
gem5/src/cpu/o3/cpu_policy.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
60 template<class Impl>
61 struct SimpleCPUPolicy
62 {
......
89 /** The struct for communication between fetch and decode. */
90 typedef DefaultFetchDefaultDecode<Impl> FetchStruct;
91
92 /** The struct for communication between decode and rename. */
93 typedef DefaultDecodeDefaultRename<Impl> DecodeStruct;
94
95 /** The struct for communication between rename and IEW. */
96 typedef DefaultRenameDefaultIEW<Impl> RenameStruct;
97
98 /** The struct for communication between IEW and commit. */
99 typedef DefaultIEWDefaultCommit<Impl> IEWStruct;
100
101 /** The struct for communication within the IEW stage. */
102 typedef ::IssueStruct<Impl> IssueStruct;
103
104 /** The struct for all backwards communication. */
105 typedef TimeBufStruct<Impl> TimeStruct;
gem5/src/cpu/o3/comm.h
1
2
3
4
5
6
7
8
9
10
11
12
55 /** Struct that defines the information passed from fetch to decode. */
56 template<class Impl>
57 struct DefaultFetchDefaultDecode {
58 typedef typename Impl::DynInstPtr DynInstPtr;
59
60 int size;
61
62 DynInstPtr insts[Impl::MaxWidth];
63 Fault fetchFault;
64 InstSeqNum fetchFaultSN;
65 bool clearFetchFault;
66 };
Most importantly, it passes the instructions fetched from the Icache.
TimeBuffer::wire generic class representing wire
The information passed from the fetch stage to the decode stage is represented as multiple wires conveying bits of information. For that purpose, gem5 provides the wire class.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
39 template <class T>
40 class TimeBuffer
41 {
42 protected:
43 int past;
44 int future;
45 unsigned size;
46 int _id;
47
48 char *data;
49 std::vector<char *> index;
50 unsigned base;
51
52 void valid(int idx) const
53 {
54 assert (idx >= -past && idx <= future);
55 }
56
57 public:
58 friend class wire;
59 class wire
60 {
61 friend class TimeBuffer;
62 protected:
63 TimeBuffer<T> *buffer;
64 int index;
65
66 void set(int idx)
67 {
68 buffer->valid(idx);
69 index = idx;
70 }
71
72 wire(TimeBuffer<T> *buf, int i)
73 : buffer(buf), index(i)
74 { }
75
76 public:
77 wire()
78 { }
79
80 wire(const wire &i)
81 : buffer(i.buffer), index(i.index)
82 { }
83
84 const wire &operator=(const wire &i)
85 {
86 buffer = i.buffer;
87 set(i.index);
88 return *this;
89 }
90
91 const wire &operator=(int idx)
92 {
93 set(idx);
94 return *this;
95 }
96
97 const wire &operator+=(int offset)
98 {
99 set(index + offset);
100 return *this;
101 }
102
103 const wire &operator-=(int offset)
104 {
105 set(index - offset);
106 return *this;
107 }
108
109 wire &operator++()
110 {
111 set(index + 1);
112 return *this;
113 }
114
115 wire &operator++(int)
116 {
117 int i = index;
118 set(index + 1);
119 return wire(this, i);
120 }
121
122 wire &operator--()
123 {
124 set(index - 1);
125 return *this;
126 }
127
128 wire &operator--(int)
129 {
130 int i = index;
131 set(index - 1);
132 return wire(this, i);
133 }
134 T &operator*() const { return *buffer->access(index); }
135 T *operator->() const { return buffer->access(index); }
136 };
......
192 protected:
193 //Calculate the index into this->index for element at position idx
194 //relative to now
195 inline int calculateVectorIndex(int idx) const
196 {
197 //Need more complex math here to calculate index.
198 valid(idx);
199
200 int vector_index = idx + base;
201 if (vector_index >= (int)size) {
202 vector_index -= size;
203 } else if (vector_index < 0) {
204 vector_index += size;
205 }
206
207 return vector_index;
208 }
209
210 public:
211 T *access(int idx)
212 {
213 int vector_index = calculateVectorIndex(idx);
214
215 return reinterpret_cast<T *>(index[vector_index]);
216 }
As shown in line 970 of the tick function of the fetch stage, it references the insts member field through the -> operator. Because toDecode is declared as a TimeBuffer::wire, and this class overloads the -> operator, it will invoke the operator-> function shown in line 135. (TODO: this needs to be explained more clearly with regard to smart pointers.)
Comments powered by Disqus.