From 1845c5948db89bfe4c9adc94aef02018baea2a09 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 2 Feb 2021 20:50:29 +0100 Subject: [PATCH] PPCAnalyst: Rework the store-safe logic The output of instructions like fabsx and ps_sel is store-safe if and only if the relevant inputs are. The old code was always marking the output as store-safe if the output was a single, and never otherwise. Also, the old code was treating the output of psq_l/psq_lu as store-safe, which seems incorrect (if dequantization is disabled). --- .../Interpreter/Interpreter_Tables.cpp | 28 ++++----- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 58 ++++++++++++++----- Source/Core/Core/PowerPC/PPCAnalyst.h | 5 +- Source/Core/Core/PowerPC/PPCTables.h | 5 ++ 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index e622d48e9a..9ba1235dfd 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -97,16 +97,16 @@ static std::array table4 = {{ //SUBOP10 {0, Interpreter::ps_cmpu0, {"ps_cmpu0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}}, {32, Interpreter::ps_cmpo0, {"ps_cmpo0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}}, - {40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {64, Interpreter::ps_cmpu1, {"ps_cmpu1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}}, - {72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {96, Interpreter::ps_cmpo1, {"ps_cmpo1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}}, - {528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {1014, Interpreter::dcbz_l, {"dcbz_l", OpType::System, FL_IN_A0B | FL_LOADSTORE, 1, 0, 0, 0}}, }}; @@ -122,7 +122,7 @@ static std::array table4_2 = {18, Interpreter::ps_div, {"ps_div", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}}, {20, Interpreter::ps_sub, {"ps_sub", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {21, Interpreter::ps_add, {"ps_add", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, - {23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {24, Interpreter::ps_res, {"ps_res", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {25, Interpreter::ps_mul, {"ps_mul", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {26, Interpreter::ps_rsqrte, {"ps_rsqrte", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}}, @@ -318,7 +318,7 @@ static std::array table59 = static std::array table63 = {{ - {264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, // FIXME: fcmp modifies the FPRF flags, but if the flags are clobbered later, // we don't actually need to calculate or store them here. So FL_READ_FPRF and FL_SET_FPRF is not @@ -329,9 +329,9 @@ static std::array table63 = {14, Interpreter::fctiwx, {"fctiwx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {15, Interpreter::fctiwzx, {"fctiwzx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}}, + {40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}}, {12, Interpreter::frspx, {"frspx", OpType::DoubleFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {64, Interpreter::mcrfs, {"mcrfs", OpType::SystemFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}}, @@ -347,7 +347,7 @@ static std::array table63_2 = {18, Interpreter::fdivx, {"fdivx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}}, {20, Interpreter::fsubx, {"fsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {21, Interpreter::faddx, {"faddx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, - {23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {25, Interpreter::fmulx, {"fmulx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {26, Interpreter::frsqrtex, {"frsqrtex", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {28, Interpreter::fmsubx, {"fmsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index b826961051..9ed0e10b5d 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -971,36 +971,62 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: op.fprIsStoreSafe = fprIsStoreSafe; if (op.fregOut >= 0) { - fprIsSingle[op.fregOut] = false; - fprIsDuplicated[op.fregOut] = false; - fprIsStoreSafe[op.fregOut] = false; - // Single, duplicated, and doesn't need PPC_FP. if (op.opinfo->type == OpType::SingleFP) { fprIsSingle[op.fregOut] = true; fprIsDuplicated[op.fregOut] = true; - fprIsStoreSafe[op.fregOut] = true; } - // Single and duplicated, but might be a denormal (not safe to skip PPC_FP). - // TODO: if we go directly from a load to store, skip conversion entirely? - // TODO: if we go directly from a load to a float instruction, and the value isn't used - // for anything else, we can skip PPC_FP on a load too. - if (!strncmp(op.opinfo->opname, "lfs", 3)) + else if (!strncmp(op.opinfo->opname, "lfs", 3)) { fprIsSingle[op.fregOut] = true; fprIsDuplicated[op.fregOut] = true; } - // Paired are still floats, but the top/bottom halves may differ. - if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS) + else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS) { fprIsSingle[op.fregOut] = true; - fprIsStoreSafe[op.fregOut] = true; + fprIsDuplicated[op.fregOut] = false; } - // Careful: changing the float mode in a block breaks this optimization, since - // a previous float op might have had had FTZ off while the later store has FTZ - // on. So, discard all information we have. + else + { + fprIsSingle[op.fregOut] = false; + fprIsDuplicated[op.fregOut] = false; + } + if (!strncmp(op.opinfo->opname, "mtfs", 4)) + { + // Careful: changing the float mode in a block breaks the store-safe optimization, + // since a previous float op might have had FTZ off while the later store has FTZ on. + // So, discard all information we have. fprIsStoreSafe = BitSet32(0); + } + else if (op.opinfo->flags & + (FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT)) + { + // If the instruction copies bits between registers (without flushing denormals to zero + // or turning SNaN into QNaN), the output is store-safe if the inputs are. + + BitSet32 bitexact_inputs; + if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT) + bitexact_inputs[code->inst.FA] = true; + if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT) + bitexact_inputs[code->inst.FB] = true; + if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT) + bitexact_inputs[code->inst.FC] = true; + + fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs; + } + else + { + // Other FPU instructions are store-safe if they perform a single-precision + // arithmetic operation. + + // TODO: if we go directly from a load to store, skip conversion entirely? + // TODO: if we go directly from a load to a float instruction, and the value isn't used + // for anything else, we can use fast single -> double conversion after the load. + + fprIsStoreSafe[op.fregOut] = + (op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS); + } } if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 455e709d4e..6aad1f8430 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -61,8 +61,9 @@ struct CodeOp // 16B // instruction) BitSet32 fprIsDuplicated; // whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can - // safely - // skip PPC_FP. + // convert between single and double formats by just using the host machine's instruction for it. + // (The reason why we can't always do this is because some games rely on the exact bits of + // denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.) BitSet32 fprIsStoreSafe; }; diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h index f3aa89e290..c99573f3ce 100644 --- a/Source/Core/Core/PowerPC/PPCTables.h +++ b/Source/Core/Core/PowerPC/PPCTables.h @@ -59,6 +59,11 @@ enum FL_OUT_FLOAT_D = (1 << 28), // frD is used as a destination. // Used in the case of double ops (they don't modify the top half of the output) FL_INOUT_FLOAT_D = FL_IN_FLOAT_D | FL_OUT_FLOAT_D, + FL_IN_FLOAT_A_BITEXACT = (1 << 29), // The output is based on the exact bits in frA. + FL_IN_FLOAT_B_BITEXACT = (1 << 30), // The output is based on the exact bits in frB. + FL_IN_FLOAT_C_BITEXACT = (1 << 31), // The output is based on the exact bits in frC. + FL_IN_FLOAT_AB_BITEXACT = FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT, + FL_IN_FLOAT_BC_BITEXACT = FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT, }; enum class OpType