Merge branch 'optimization/idle-loop-detection'
@@ -18,6 +18,7 @@ - Implemented BIOS routines SoftReset, RegisterRamReset, Diff8bitUnFilterWram, Diff8bitUnFilterVram, and Diff16bitUnFilter
- Support IPv6 - Save directory of last loaded file - Support BPS patches + - Automatically detect and optimize out idle loops - Configurable game overrides - Support loading 7-Zip files - Drag and drop game loading
@@ -212,6 +212,18 @@
_lookupIntValue(config, "fullscreen", &opts->fullscreen); _lookupIntValue(config, "width", &opts->width); _lookupIntValue(config, "height", &opts->height); + + char* idleOptimization = 0; + if (_lookupCharValue(config, "idleOptimization", &idleOptimization)) { + if (strcasecmp(idleOptimization, "ignore") == 0) { + opts->idleOptimization = IDLE_LOOP_IGNORE; + } else if (strcasecmp(idleOptimization, "remove") == 0) { + opts->idleOptimization = IDLE_LOOP_REMOVE; + } else if (strcasecmp(idleOptimization, "detect") == 0) { + opts->idleOptimization = IDLE_LOOP_DETECT; + } + free(idleOptimization); + } } void GBAConfigLoadDefaults(struct GBAConfig* config, const struct GBAOptions* opts) {@@ -231,6 +243,18 @@ ConfigurationSetIntValue(&config->defaultsTable, 0, "width", opts->width);
ConfigurationSetIntValue(&config->defaultsTable, 0, "height", opts->height); ConfigurationSetIntValue(&config->defaultsTable, 0, "lockAspectRatio", opts->lockAspectRatio); ConfigurationSetIntValue(&config->defaultsTable, 0, "resampleVideo", opts->resampleVideo); + + switch (opts->idleOptimization) { + case IDLE_LOOP_IGNORE: + ConfigurationSetValue(&config->defaultsTable, 0, "idleOptimization", "ignore"); + break; + case IDLE_LOOP_REMOVE: + ConfigurationSetValue(&config->defaultsTable, 0, "idleOptimization", "remove"); + break; + case IDLE_LOOP_DETECT: + ConfigurationSetValue(&config->defaultsTable, 0, "idleOptimization", "detect"); + break; + } } void GBAConfigFreeOpts(struct GBAOptions* opts) {
@@ -8,6 +8,8 @@ #define GBA_CONFIG_H
#include "util/common.h" +#include "gba.h" + #include "util/configuration.h" struct GBAConfig {@@ -35,6 +37,8 @@ bool resampleVideo;
bool videoSync; bool audioSync; + + enum GBAIdleLoopOptimization idleOptimization; }; void GBAConfigInit(struct GBAConfig*, const char* port);
@@ -567,6 +567,7 @@ gba->memory.io[(address >> 1) + 1] = value >> 16;
} uint16_t GBAIORead(struct GBA* gba, uint32_t address) { + gba->lastJump = -1; // IO reads need to invalidate detected idle loops switch (address) { case REG_TM0CNT_LO: GBATimerUpdateRegister(gba, 0);
@@ -7,11 +7,14 @@ #include "gba-memory.h"
#include "macros.h" +#include "decoder.h" #include "gba-gpio.h" #include "gba-io.h" #include "gba-serialize.h" #include "hle-bios.h" #include "util/memory.h" + +#define IDLE_LOOP_THRESHOLD 10000 static uint32_t _popcount32(unsigned bits); static uint32_t _deadbeef[2] = { 0xDEADBEEF, 0xFEEDFACE };@@ -114,18 +117,118 @@ GBALog(gba, GBA_LOG_FATAL, "Could not map memory");
} }
+/*
+ * Statically scans the code starting at `address` to decide whether it forms
+ * an idle loop, i.e. a straight-line Thumb sequence that ends in a branch back
+ * to `address` and performs no stores and no I/O reads.
+ *
+ * gba:     emulator state; gba->cachedRegisters must already hold the register
+ *          snapshot taken by GBASetActiveRegion on the previous pass through
+ *          the same jump target. gba->taintedRegisters is reset here.
+ * cpu:     core whose active memory region is decoded.
+ * address: jump target that was hit twice in a row with identical registers.
+ *
+ * On success gba->idleLoop is set to `address` and gba->idleOptimization is
+ * switched to IDLE_LOOP_REMOVE. In every case gba->idleDetectionStep is left
+ * at -1, which matches no case in GBASetActiveRegion's detection switch, so
+ * this target is not analyzed again until a jump to a different address resets
+ * the step to 0. Only Thumb code is analyzed; any other execution mode is
+ * rejected immediately.
+ */
+static void _analyzeForIdleLoop(struct GBA* gba, struct ARMCore* cpu, uint32_t address) {
+	struct ARMInstructionInfo info;
+	uint32_t nextAddress = address;
+	memset(gba->taintedRegisters, 0, sizeof(gba->taintedRegisters));
+	if (cpu->executionMode == MODE_THUMB) {
+		while (true) {
+			uint16_t opcode;
+			LOAD_16(opcode, nextAddress & cpu->memory.activeMask, cpu->memory.activeRegion);
+			ARMDecodeThumb(opcode, &info);
+			switch (info.branchType) {
+			case ARM_BRANCH_NONE:
+				if (info.operandFormat & ARM_OPERAND_MEMORY_2) {
+					/* A store, or a load through a base register the loop has modified, cannot be reasoned about. */
+					if (info.mnemonic == ARM_MN_STR || gba->taintedRegisters[info.memory.baseReg]) {
+						gba->idleDetectionStep = -1;
+						return;
+					}
+					uint32_t loadAddress = gba->cachedRegisters[info.memory.baseReg];
+					uint32_t offset = 0;
+					if (info.memory.format & ARM_MEMORY_IMMEDIATE_OFFSET) {
+						offset = info.memory.offset.immediate;
+					} else if (info.memory.format & ARM_MEMORY_REGISTER_OFFSET) {
+						int reg = info.memory.offset.reg;
+						/*
+						 * The offset register is only trustworthy if the loop has not
+						 * modified it. This used to test cachedRegisters[reg] (the
+						 * register's value), which rejected every non-zero offset and
+						 * accepted a tainted register whose snapshot happened to be 0.
+						 */
+						if (gba->taintedRegisters[reg]) {
+							gba->idleDetectionStep = -1;
+							return;
+						}
+						offset = gba->cachedRegisters[reg];
+					}
+					if (info.memory.format & ARM_MEMORY_OFFSET_SUBTRACT) {
+						loadAddress -= offset;
+					} else {
+						loadAddress += offset;
+					}
+					/* Loads from the I/O region disqualify the loop outright. */
+					if ((loadAddress >> BASE_OFFSET) == REGION_IO) {
+						gba->idleDetectionStep = -1;
+						return;
+					}
+					if ((loadAddress >> BASE_OFFSET) < REGION_CART0 || (loadAddress >> BASE_OFFSET) > REGION_CART2_EX) {
+						/* Loaded from outside the cartridge regions: the destination's value is unknown from here on. */
+						gba->taintedRegisters[info.op1.reg] = true;
+					} else {
+						/* Loaded from a cartridge region: record the loaded value as the register's known contents. */
+						switch (info.memory.width) {
+						case 1:
+							gba->cachedRegisters[info.op1.reg] = GBALoad8(cpu, loadAddress, 0);
+							break;
+						case 2:
+							gba->cachedRegisters[info.op1.reg] = GBALoad16(cpu, loadAddress, 0);
+							break;
+						case 4:
+							gba->cachedRegisters[info.op1.reg] = GBALoad32(cpu, loadAddress, 0);
+							break;
+						}
+					}
+				} else if (info.operandFormat & ARM_OPERAND_AFFECTED_1) {
+					/* Any other instruction that writes its first operand makes that register unknown. */
+					gba->taintedRegisters[info.op1.reg] = true;
+				}
+				nextAddress += WORD_SIZE_THUMB;
+				break;
+			case ARM_BRANCH:
+				/* The branch offset is applied to the address two Thumb words past the branch itself. */
+				if ((uint32_t) info.op1.immediate + nextAddress + WORD_SIZE_THUMB * 2 == address) {
+					gba->idleLoop = address;
+					gba->idleOptimization = IDLE_LOOP_REMOVE;
+				}
+				gba->idleDetectionStep = -1;
+				return;
+			default:
+				/* Any other kind of branch ends the analysis without a result. */
+				gba->idleDetectionStep = -1;
+				return;
+			}
+		}
+	} else {
+		gba->idleDetectionStep = -1;
+	}
+}
+
 static void GBASetActiveRegion(struct ARMCore* cpu, uint32_t address) {
 	struct GBA* gba = (struct GBA*) cpu->master;
 	struct GBAMemory* memory = &gba->memory;

-	if (address == gba->busyLoop && memory->activeRegion != REGION_BIOS) {
-		GBAHalt(gba);
+	int newRegion = address >> BASE_OFFSET;
+	if (gba->idleOptimization >= IDLE_LOOP_REMOVE && memory->activeRegion != REGION_BIOS) {
+		if (address == gba->lastJump && address == gba->idleLoop) {
+			GBAHalt(gba);
+		} else if (gba->idleOptimization >= IDLE_LOOP_DETECT && newRegion == memory->activeRegion) {
+			if (address == gba->lastJump) {
+				switch (gba->idleDetectionStep) {
+				case 0:
+					memcpy(gba->cachedRegisters, cpu->gprs, sizeof(gba->cachedRegisters));
+					++gba->idleDetectionStep;
+					break;
+				case 1:
+					if (memcmp(gba->cachedRegisters, cpu->gprs, sizeof(gba->cachedRegisters))) {
+						gba->idleDetectionStep = -1;
+						++gba->idleDetectionFailures;
+						if (gba->idleDetectionFailures > IDLE_LOOP_THRESHOLD) {
+							gba->idleOptimization = IDLE_LOOP_IGNORE;
+						}
+						break;
+					}
+					_analyzeForIdleLoop(gba, cpu, address);
+					break;
+				}
+			} else {
+				gba->idleDetectionStep = 0;
+			}
+		}
 	}

-	int newRegion = address >> BASE_OFFSET;
+	gba->lastJump = address;
 	if (newRegion == memory->activeRegion) {
 		return;
 	}

+
 	if (memory->activeRegion == REGION_BIOS) {
 		memory->biosPrefetch = cpu->prefetch[1];
 	}
@@ -26,7 +26,7 @@ { "V49J", SAVEDATA_SRAM, GPIO_RUMBLE, -1 },
{ "V49E", SAVEDATA_SRAM, GPIO_RUMBLE, -1 }, // Final Fantasy Tactics Advance - { "AFXE", SAVEDATA_FLASH512, GPIO_NONE, 0x8000418 }, + { "AFXE", SAVEDATA_FLASH512, GPIO_NONE, 0x8000428 }, // Koro Koro Puzzle - Happy Panechu! { "KHPJ", SAVEDATA_EEPROM, GPIO_TILT, -1 },@@ -240,6 +240,9 @@ }
} if (override->idleLoop != 0xFFFFFFFF) { - gba->busyLoop = override->idleLoop; + gba->idleLoop = override->idleLoop; + if (gba->idleOptimization == IDLE_LOOP_DETECT) { + gba->idleOptimization = IDLE_LOOP_REMOVE; + } } }
@@ -120,6 +120,7 @@ ARMInit(&cpu);
gba.sync = &threadContext->sync; threadContext->gba = &gba; gba.logLevel = threadContext->logLevel; + gba.idleOptimization = threadContext->idleOptimization; #ifdef USE_PTHREADS pthread_setspecific(_contextKey, threadContext); #else@@ -260,6 +261,8 @@
if (opts->audioBuffers) { threadContext->audioBuffers = opts->audioBuffers; } + + threadContext->idleOptimization = opts->idleOptimization; } void GBAMapArgumentsToContext(const struct GBAArguments* args, struct GBAThread* threadContext) {
@@ -72,6 +72,7 @@ const char* fname;
int activeKeys; struct GBAAVStream* stream; struct Configuration* overrides; + enum GBAIdleLoopOptimization idleOptimization; bool hasOverride; struct GBACartridgeOverride override;
@@ -76,7 +76,11 @@ gba->logLevel = GBA_LOG_INFO | GBA_LOG_WARN | GBA_LOG_ERROR | GBA_LOG_FATAL;
gba->biosChecksum = GBAChecksum(gba->memory.bios, SIZE_BIOS); - gba->busyLoop = -1; + gba->idleOptimization = IDLE_LOOP_REMOVE; + gba->idleLoop = -1; + gba->lastJump = 0; + gba->idleDetectionStep = 0; + gba->idleDetectionFailures = 0; gba->performingDMA = false; }
@@ -75,6 +75,12 @@ GBA_COMPONENT_DEBUGGER,
GBA_COMPONENT_MAX }; +/* Idle-loop handling policy. The numeric order is significant: GBASetActiveRegion halts at a known idle loop when the setting is >= IDLE_LOOP_REMOVE and runs runtime detection only when it is >= IDLE_LOOP_DETECT, so IDLE_LOOP_IGNORE (-1) disables both and new values must keep this ordering. */ enum GBAIdleLoopOptimization { + IDLE_LOOP_IGNORE = -1, + IDLE_LOOP_REMOVE = 0, + IDLE_LOOP_DETECT +}; + enum { SP_BASE_SYSTEM = 0x03007F00, SP_BASE_IRQ = 0x03007FA0,@@ -120,7 +126,6 @@
int springIRQ; uint32_t biosChecksum; int* keySource; - uint32_t busyLoop; struct GBARotationSource* rotationSource; struct GBALuminanceSource* luminanceSource; struct GBARTCSource* rtcSource;@@ -136,6 +141,14 @@
const char* activeFile; int logLevel; + + enum GBAIdleLoopOptimization idleOptimization; + uint32_t idleLoop; + uint32_t lastJump; + int idleDetectionStep; + int idleDetectionFailures; + int32_t cachedRegisters[16]; + bool taintedRegisters[16]; }; struct GBACartridge {
@@ -56,7 +56,11 @@ struct GBAConfig config;
GBAConfigInit(&config, "perf"); GBAConfigLoad(&config); - struct GBAOptions opts = {}; + struct GBAOptions opts = { + .idleOptimization = IDLE_LOOP_DETECT + }; + GBAConfigLoadDefaults(&config, &opts); + struct GBAArguments args = {}; if (!parseArguments(&args, &config, argc, argv, &subparser)) { usage(argv[0], PERF_USAGE);@@ -69,7 +73,7 @@
renderer.outputBuffer = malloc(256 * 256 * 4); renderer.outputBufferStride = 256; - struct GBAThread context = { }; + struct GBAThread context = {}; _thread = &context; if (!perfOpts.noVideo) {