GBA Memory: Implement game pak prefetch approximation
@@ -110,8 +110,7 @@ uint32_t activeSeqCycles32;
uint32_t activeSeqCycles16; uint32_t activeNonseqCycles32; uint32_t activeNonseqCycles16; - uint32_t activeUncachedCycles32; - uint32_t activeUncachedCycles16; + int32_t (*stall)(struct ARMCore*, int32_t wait); void (*setActiveRegion)(struct ARMCore*, uint32_t address); };
@@ -259,7 +259,7 @@
#define ADDR_MODE_4_WRITEBACK_STM cpu->gprs[rn] = address; #define ARM_LOAD_POST_BODY \ - currentCycles += 1 + cpu->memory.activeNonseqCycles32 - cpu->memory.activeSeqCycles32; \ + currentCycles += cpu->memory.activeNonseqCycles32 - cpu->memory.activeSeqCycles32; \ if (rd == ARM_PC) { \ ARM_WRITE_PC; \ }@@ -567,7 +567,7 @@ ARM_STORE_POST_BODY;)
DEFINE_LOAD_STORE_MULTIPLE_INSTRUCTION_ARM(LDM, load, - currentCycles += 1 + cpu->memory.activeNonseqCycles32 - cpu->memory.activeSeqCycles32; + currentCycles += cpu->memory.activeNonseqCycles32 - cpu->memory.activeSeqCycles32; if (rs & 0x8000) { ARM_WRITE_PC; })
@@ -36,14 +36,18 @@ #define ARM_V_ADDITION(M, N, D) (!(ARM_SIGN((M) ^ (N))) && (ARM_SIGN((M) ^ (D))) && (ARM_SIGN((N) ^ (D))))
#define ARM_V_SUBTRACTION(M, N, D) ((ARM_SIGN((M) ^ (N))) && (ARM_SIGN((M) ^ (D)))) #define ARM_WAIT_MUL(R) \ - if ((R & 0xFFFFFF00) == 0xFFFFFF00 || !(R & 0xFFFFFF00)) { \ - currentCycles += 1; \ - } else if ((R & 0xFFFF0000) == 0xFFFF0000 || !(R & 0xFFFF0000)) { \ - currentCycles += 2; \ - } else if ((R & 0xFF000000) == 0xFF000000 || !(R & 0xFF000000)) { \ - currentCycles += 3; \ - } else { \ - currentCycles += 4; \ + { \ + int32_t wait; \ + if ((R & 0xFFFFFF00) == 0xFFFFFF00 || !(R & 0xFFFFFF00)) { \ + wait = 1; \ + } else if ((R & 0xFFFF0000) == 0xFFFF0000 || !(R & 0xFFFF0000)) { \ + wait = 2; \ + } else if ((R & 0xFF000000) == 0xFF000000 || !(R & 0xFF000000)) { \ + wait = 3; \ + } else { \ + wait = 4; \ + } \ + currentCycles += cpu->memory.stall(cpu, wait); \ } #define ARM_STUB cpu->irqh.hitStub(cpu, opcode)@@ -55,7 +59,7 @@ cpu->memory.setActiveRegion(cpu, cpu->gprs[ARM_PC]); \
LOAD_32(cpu->prefetch[0], cpu->gprs[ARM_PC] & cpu->memory.activeMask, cpu->memory.activeRegion); \ cpu->gprs[ARM_PC] += WORD_SIZE_ARM; \ LOAD_32(cpu->prefetch[1], cpu->gprs[ARM_PC] & cpu->memory.activeMask, cpu->memory.activeRegion); \ - currentCycles += 2 + cpu->memory.activeUncachedCycles32 + cpu->memory.activeSeqCycles32; + currentCycles += 2 + cpu->memory.activeNonseqCycles32 + cpu->memory.activeSeqCycles32; #define THUMB_WRITE_PC \ cpu->gprs[ARM_PC] = (cpu->gprs[ARM_PC] & -WORD_SIZE_THUMB); \@@ -63,7 +67,7 @@ cpu->memory.setActiveRegion(cpu, cpu->gprs[ARM_PC]); \
LOAD_16(cpu->prefetch[0], cpu->gprs[ARM_PC] & cpu->memory.activeMask, cpu->memory.activeRegion); \ cpu->gprs[ARM_PC] += WORD_SIZE_THUMB; \ LOAD_16(cpu->prefetch[1], cpu->gprs[ARM_PC] & cpu->memory.activeMask, cpu->memory.activeRegion); \ - currentCycles += 2 + cpu->memory.activeUncachedCycles16 + cpu->memory.activeSeqCycles16; + currentCycles += 2 + cpu->memory.activeNonseqCycles16 + cpu->memory.activeSeqCycles16; static inline int _ARMModeHasSPSR(enum PrivilegeMode mode) { return mode != MODE_SYSTEM && mode != MODE_USER;
@@ -42,7 +42,7 @@
#define THUMB_PREFETCH_CYCLES (1 + cpu->memory.activeSeqCycles16) #define THUMB_LOAD_POST_BODY \ - currentCycles += 1 + cpu->memory.activeNonseqCycles16 - cpu->memory.activeSeqCycles16; + currentCycles += cpu->memory.activeNonseqCycles16 - cpu->memory.activeSeqCycles16; #define THUMB_STORE_POST_BODY \ currentCycles += cpu->memory.activeNonseqCycles16 - cpu->memory.activeSeqCycles16;
@@ -91,7 +91,7 @@ gba->haltPending = false;
gba->idleDetectionStep = 0; gba->idleDetectionFailures = 0; - gba->realisticTiming = false; + gba->realisticTiming = true; gba->performingDMA = false; }
@@ -22,6 +22,7 @@ static uint32_t _deadbeef[1] = { 0xE710B710 }; // Illegal instruction on both ARM and Thumb
static void GBASetActiveRegion(struct ARMCore* cpu, uint32_t region); static void GBAMemoryServiceDMA(struct GBA* gba, int number, struct GBADMA* info); +static int32_t GBAMemoryStall(struct ARMCore* cpu, int32_t wait); static const char GBA_BASE_WAITSTATES[16] = { 0, 0, 2, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4 }; static const char GBA_BASE_WAITSTATES_32[16] = { 0, 0, 5, 0, 0, 1, 1, 0, 7, 7, 9, 9, 13, 13, 9 };@@ -41,6 +42,7 @@ cpu->memory.store32 = GBAStore32;
cpu->memory.store16 = GBAStore16; cpu->memory.store8 = GBAStore8; cpu->memory.storeMultiple = GBAStoreMultiple; + cpu->memory.stall = GBAMemoryStall; gba->memory.bios = (uint32_t*) hleBios; gba->memory.fullBios = 0;@@ -76,8 +78,6 @@ cpu->memory.activeSeqCycles32 = 0;
cpu->memory.activeSeqCycles16 = 0; cpu->memory.activeNonseqCycles32 = 0; cpu->memory.activeNonseqCycles16 = 0; - cpu->memory.activeUncachedCycles32 = 0; - cpu->memory.activeUncachedCycles16 = 0; gba->memory.biosPrefetch = 0; }@@ -278,12 +278,10 @@ GBALog(gba, GBA_LOG_FATAL, "Jumped to invalid address");
} return; } - cpu->memory.activeSeqCycles32 = memory->waitstatesPrefetchSeq32[newRegion]; - cpu->memory.activeSeqCycles16 = memory->waitstatesPrefetchSeq16[newRegion]; - cpu->memory.activeNonseqCycles32 = memory->waitstatesPrefetchNonseq32[newRegion]; - cpu->memory.activeNonseqCycles16 = memory->waitstatesPrefetchNonseq16[newRegion]; - cpu->memory.activeUncachedCycles32 = memory->waitstatesNonseq32[newRegion]; - cpu->memory.activeUncachedCycles16 = memory->waitstatesNonseq16[newRegion]; + cpu->memory.activeSeqCycles32 = memory->waitstatesSeq32[memory->activeRegion]; + cpu->memory.activeSeqCycles16 = memory->waitstatesSeq16[memory->activeRegion]; + cpu->memory.activeNonseqCycles32 = memory->waitstatesNonseq32[memory->activeRegion]; + cpu->memory.activeNonseqCycles16 = memory->waitstatesNonseq16[memory->activeRegion]; } #define LOAD_BAD \@@ -412,7 +410,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + wait += 2; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } // Unaligned 32-bit loads are "rotated" so they make some semblance of sense int rotate = (address & 3) << 3;@@ -503,7 +505,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + wait += 2; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } // Unaligned 16-bit loads are "unpredictable", but the GBA rotates them, so we have to, too. int rotate = (address & 1) << 3;@@ -595,7 +601,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + wait += 2; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } return value; }@@ -682,7 +692,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + ++wait; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } }@@ -742,7 +756,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + ++wait; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } }@@ -808,7 +826,11 @@ break;
} if (cycleCounter) { - *cycleCounter += 1 + wait; + ++wait; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } + *cycleCounter += wait; } }@@ -1100,6 +1122,10 @@ break;
} if (cycleCounter) { + ++wait; + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } *cycleCounter += wait; }@@ -1206,6 +1232,9 @@ break;
} if (cycleCounter) { + if (address >> BASE_OFFSET < REGION_CART0) { + wait = GBAMemoryStall(cpu, wait); + } *cycleCounter += wait; }@@ -1253,50 +1282,13 @@ memory->waitstatesSeq32[REGION_CART0] = memory->waitstatesSeq32[REGION_CART0_EX] = 2 * memory->waitstatesSeq16[REGION_CART0] + 1;
memory->waitstatesSeq32[REGION_CART1] = memory->waitstatesSeq32[REGION_CART1_EX] = 2 * memory->waitstatesSeq16[REGION_CART1] + 1; memory->waitstatesSeq32[REGION_CART2] = memory->waitstatesSeq32[REGION_CART2_EX] = 2 * memory->waitstatesSeq16[REGION_CART2] + 1; - if (!prefetch) { - memory->waitstatesPrefetchSeq16[REGION_CART0] = memory->waitstatesPrefetchSeq16[REGION_CART0_EX] = memory->waitstatesSeq16[REGION_CART0]; - memory->waitstatesPrefetchSeq16[REGION_CART1] = memory->waitstatesPrefetchSeq16[REGION_CART1_EX] = memory->waitstatesSeq16[REGION_CART1]; - memory->waitstatesPrefetchSeq16[REGION_CART2] = memory->waitstatesPrefetchSeq16[REGION_CART2_EX] = memory->waitstatesSeq16[REGION_CART2]; + memory->prefetch = prefetch; - memory->waitstatesPrefetchSeq32[REGION_CART0] = memory->waitstatesPrefetchSeq32[REGION_CART0_EX] = memory->waitstatesSeq32[REGION_CART0]; - memory->waitstatesPrefetchSeq32[REGION_CART1] = memory->waitstatesPrefetchSeq32[REGION_CART1_EX] = memory->waitstatesSeq32[REGION_CART1]; - memory->waitstatesPrefetchSeq32[REGION_CART2] = memory->waitstatesPrefetchSeq32[REGION_CART2_EX] = memory->waitstatesSeq32[REGION_CART2]; + cpu->memory.activeSeqCycles32 = memory->waitstatesSeq32[memory->activeRegion]; + cpu->memory.activeSeqCycles16 = memory->waitstatesSeq16[memory->activeRegion]; - memory->waitstatesPrefetchNonseq16[REGION_CART0] = memory->waitstatesPrefetchNonseq16[REGION_CART0_EX] = memory->waitstatesNonseq16[REGION_CART0]; - memory->waitstatesPrefetchNonseq16[REGION_CART1] = memory->waitstatesPrefetchNonseq16[REGION_CART1_EX] = memory->waitstatesNonseq16[REGION_CART1]; - memory->waitstatesPrefetchNonseq16[REGION_CART2] = memory->waitstatesPrefetchNonseq16[REGION_CART2_EX] = memory->waitstatesNonseq16[REGION_CART2]; - - memory->waitstatesPrefetchNonseq32[REGION_CART0] = memory->waitstatesPrefetchNonseq32[REGION_CART0_EX] = memory->waitstatesNonseq32[REGION_CART0]; - memory->waitstatesPrefetchNonseq32[REGION_CART1] = memory->waitstatesPrefetchNonseq32[REGION_CART1_EX] = memory->waitstatesNonseq32[REGION_CART1]; - memory->waitstatesPrefetchNonseq32[REGION_CART2] = memory->waitstatesPrefetchNonseq32[REGION_CART2_EX] = memory->waitstatesNonseq32[REGION_CART2]; - } else { - // Assume it stalls one cycle to pull a value from the prefetch - // This needs more research to tell if it's accurate or not - memory->waitstatesPrefetchSeq16[REGION_CART0] = memory->waitstatesPrefetchSeq16[REGION_CART0_EX] = 1; - memory->waitstatesPrefetchSeq16[REGION_CART1] = memory->waitstatesPrefetchSeq16[REGION_CART1_EX] = 1; - memory->waitstatesPrefetchSeq16[REGION_CART2] = memory->waitstatesPrefetchSeq16[REGION_CART2_EX] = 1; - - memory->waitstatesPrefetchSeq32[REGION_CART0] = memory->waitstatesPrefetchSeq32[REGION_CART0_EX] = 2; - memory->waitstatesPrefetchSeq32[REGION_CART1] = memory->waitstatesPrefetchSeq32[REGION_CART1_EX] = 2; - memory->waitstatesPrefetchSeq32[REGION_CART2] = memory->waitstatesPrefetchSeq32[REGION_CART2_EX] = 2; - - memory->waitstatesPrefetchNonseq16[REGION_CART0] = memory->waitstatesPrefetchNonseq16[REGION_CART0_EX] = 1; - memory->waitstatesPrefetchNonseq16[REGION_CART1] = memory->waitstatesPrefetchNonseq16[REGION_CART1_EX] = 1; - memory->waitstatesPrefetchNonseq16[REGION_CART2] = memory->waitstatesPrefetchNonseq16[REGION_CART2_EX] = 1; - - memory->waitstatesPrefetchNonseq32[REGION_CART0] = memory->waitstatesPrefetchNonseq32[REGION_CART0_EX] = 2; - memory->waitstatesPrefetchNonseq32[REGION_CART1] = memory->waitstatesPrefetchNonseq32[REGION_CART1_EX] = 2; - memory->waitstatesPrefetchNonseq32[REGION_CART2] = memory->waitstatesPrefetchNonseq32[REGION_CART2_EX] = 2; - } - - cpu->memory.activeSeqCycles32 = memory->waitstatesPrefetchSeq32[memory->activeRegion]; - cpu->memory.activeSeqCycles16 = memory->waitstatesPrefetchSeq16[memory->activeRegion]; - - cpu->memory.activeNonseqCycles32 = memory->waitstatesPrefetchNonseq32[memory->activeRegion]; - cpu->memory.activeNonseqCycles16 = memory->waitstatesPrefetchNonseq16[memory->activeRegion]; - - cpu->memory.activeUncachedCycles32 = memory->waitstatesNonseq32[memory->activeRegion]; - cpu->memory.activeUncachedCycles16 = memory->waitstatesNonseq16[memory->activeRegion]; + cpu->memory.activeNonseqCycles32 = memory->waitstatesNonseq32[memory->activeRegion]; + cpu->memory.activeNonseqCycles16 = memory->waitstatesNonseq16[memory->activeRegion]; } void GBAMemoryWriteDMASAD(struct GBA* gba, int dma, uint32_t address) {@@ -1526,6 +1518,31 @@ if (info->nextEvent != INT_MAX) {
info->nextEvent += cycles; } cpu->cycles += cycles; +} + +int32_t GBAMemoryStall(struct ARMCore* cpu, int32_t wait) { + struct GBA* gba = (struct GBA*) cpu->master; + struct GBAMemory* memory = &gba->memory; + + if (!memory->prefetch || memory->activeRegion < REGION_CART0) { + return wait; + } + + int32_t stall = 5 - memory->waitstatesSeq16[memory->activeRegion]; // Figure out where this value comes from + + // Base number of cycles for this insn is N + int32_t base = memory->waitstatesSeq16[memory->activeRegion] + 1; + if (cpu->executionMode == MODE_ARM) { + base <<= 1; + } + if (base <= wait) { + --base; + } else { + base = wait; + } + + cpu->cycles -= stall + base - 1; + return wait; } void GBAMemorySerialize(const struct GBAMemory* memory, struct GBASerializedState* state) {
@@ -131,6 +131,7 @@ char waitstatesPrefetchSeq16[16];
char waitstatesPrefetchNonseq32[16]; char waitstatesPrefetchNonseq16[16]; int activeRegion; + bool prefetch; uint32_t biosPrefetch; struct GBADMA dma[4];