Optimize AttributeBuffer to OutputVertex conversion (#3283)

Optimize AttributeBuffer to OutputVertex conversion

First I unrolled the inner loop, then I pushed semantics validation
outside of the hotloop.

I also added overflow slots to avoid conditional branches.

Super Mario 3D Land's intro runs at almost full speed when compiled with
Clang, and theres a noticible speed increase in MSVC. GCC hasn't been
tested but I'm confident in its ability to optimize this code.
This commit is contained in:
Dwayne Slater 2018-01-02 18:32:33 -05:00 committed by Yuri Kunde Schlesner
parent 3f7f2b42c0
commit 41929371dc
4 changed files with 34 additions and 18 deletions

View file

@ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
MICROPROFILE_SCOPE(GPU_Drawing);
immediate_attribute_id = 0;
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
auto* shader_engine = Shader::GetEngine();
shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
@ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
// Later, these can be compiled and cached.
const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
VertexLoader loader(regs.pipeline);
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
// Load vertices
bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));

View file

@ -87,6 +87,8 @@ struct RasterizerRegs {
BitField<8, 5, Semantic> map_y;
BitField<16, 5, Semantic> map_z;
BitField<24, 5, Semantic> map_w;
u32 raw;
} vs_output_attributes[7];
INSERT_PADDING_WORDS(0xe);

View file

@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cinttypes>
#include <cmath>
#include <cstring>
#include "common/bit_set.h"
@ -21,32 +22,41 @@ namespace Pica {
namespace Shader {
void OutputVertex::ValidateSemantics(const RasterizerRegs& regs) {
unsigned int num_attributes = regs.vs_output_total;
ASSERT(num_attributes <= 7);
for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
u32 output_register_map = regs.vs_output_attributes[attrib].raw;
for (size_t comp = 0; comp < 4; ++comp) {
u32 semantic = (output_register_map >> (8 * comp)) & 0x1F;
ASSERT_MSG(semantic < 24 || semantic == RasterizerRegs::VSOutputAttributes::INVALID,
"Invalid/unknown semantic id: %" PRIu32, semantic);
}
}
}
OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
const AttributeBuffer& input) {
// Setup output data
union {
OutputVertex ret{};
std::array<float24, 24> vertex_slots;
// Allow us to overflow OutputVertex to avoid branches, since
// RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
// would be out of bounds otherwise.
std::array<float24, 32> vertex_slots_overflow;
};
static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
unsigned int num_attributes = regs.vs_output_total;
ASSERT(num_attributes <= 7);
for (unsigned int i = 0; i < num_attributes; ++i) {
const auto& output_register_map = regs.vs_output_attributes[i];
// Assert that OutputVertex has enough space for 24 semantic registers
static_assert(sizeof(std::array<float24, 24>) == sizeof(ret),
"Struct and array have different sizes.");
RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = {
output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
output_register_map.map_w};
for (unsigned comp = 0; comp < 4; ++comp) {
RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp];
if (semantic < vertex_slots.size()) {
vertex_slots[semantic] = input.attr[i][comp];
} else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) {
LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
}
}
unsigned int num_attributes = regs.vs_output_total & 7;
for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
const auto output_register_map = regs.vs_output_attributes[attrib];
vertex_slots_overflow[output_register_map.map_x] = input.attr[attrib][0];
vertex_slots_overflow[output_register_map.map_y] = input.attr[attrib][1];
vertex_slots_overflow[output_register_map.map_z] = input.attr[attrib][2];
vertex_slots_overflow[output_register_map.map_w] = input.attr[attrib][3];
}
// The hardware takes the absolute and saturates vertex colors like this, *before* doing

View file

@ -50,6 +50,7 @@ struct OutputVertex {
INSERT_PADDING_WORDS(1);
Math::Vec2<float24> tc2;
static void ValidateSemantics(const RasterizerRegs& regs);
static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
const AttributeBuffer& output);
};