aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicolas Capens <capn@google.com>2021-02-05 15:18:42 -0500
committerNicolas Capens <nicolascapens@google.com>2021-02-20 03:50:35 +0000
commit0cfc043a97311d6ea24f9b2f6b2b3118b05012d4 (patch)
treee9941ab692525d5ba8085c4bb694b517ebd50edf
parent54313fbf3567496ee2fae9aea86ca9df6ba65a43 (diff)
downloadswiftshader-0cfc043a97311d6ea24f9b2f6b2b3118b05012d4.tar.gz
Implement scalar replacement of aggregates
This change implements the Scalar Replacement of Aggregates (SRoA) optimization. Since Reactor only supports array aggregates, this replaces arrays which are only indexed by static indices with individual variables for each element. The ReactorArray test is optimized from: sub rsp,38h mov dword ptr [rsp],1 mov dword ptr [rsp+4],2 mov eax,dword ptr [rsp] mov ecx,dword ptr [rsp+4] mov dword ptr [rsp],ecx mov dword ptr [rsp+4],eax mov eax,dword ptr [rsp+4] add eax,dword ptr [rsp] add rsp,38h ret Into: sub rsp,20h mov eax,2 add eax,1 add rsp,20h ret Which is identical to the CArray test's generated code. Bug: b/179279298 Change-Id: Ie45261f2ac783bdc22fee06adf03ebd588f3ebc3 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/52428 Presubmit-Ready: Nicolas Capens <nicolascapens@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Antonio Maiorano <amaiorano@google.com>
-rw-r--r--src/Reactor/Optimizer.cpp163
-rw-r--r--src/Reactor/SubzeroReactor.cpp4
-rw-r--r--tests/ReactorUnitTests/ReactorUnitTests.cpp88
3 files changed, 253 insertions, 2 deletions
diff --git a/src/Reactor/Optimizer.cpp b/src/Reactor/Optimizer.cpp
index 8ec7af1bb..acc9eae37 100644
--- a/src/Reactor/Optimizer.cpp
+++ b/src/Reactor/Optimizer.cpp
@@ -36,6 +36,7 @@ private:
void eliminateDeadCode();
void propagateAlloca();
+ void performScalarReplacementOfAggregates();
void eliminateUnitializedLoads();
void eliminateLoadsFollowingSingleStore();
void optimizeStoresInSingleBasicBlock();
@@ -43,6 +44,7 @@ private:
void replace(Ice::Inst *instruction, Ice::Operand *newValue);
void deleteInstruction(Ice::Inst *instruction);
bool isDead(Ice::Inst *instruction);
+ bool isStaticallyIndexedArray(Ice::Operand *allocaAddress);
static const Ice::InstIntrinsic *asLoadSubVector(const Ice::Inst *instruction);
static const Ice::InstIntrinsic *asStoreSubVector(const Ice::Inst *instruction);
@@ -113,6 +115,9 @@ void Optimizer::run(Ice::Cfg *function)
// Eliminate allocas which store the address of other allocas.
propagateAlloca();
+ // Replace arrays with individual elements if only statically indexed.
+ performScalarReplacementOfAggregates();
+
eliminateUnitializedLoads();
eliminateLoadsFollowingSingleStore();
optimizeStoresInSingleBasicBlock();
@@ -185,6 +190,117 @@ void Optimizer::propagateAlloca()
}
}
+Ice::Type pointerType()
+{
+ if(sizeof(void *) == 8)
+ {
+ return Ice::IceType_i64;
+ }
+ else
+ {
+ return Ice::IceType_i32;
+ }
+}
+
+// Replace arrays with individual elements if only statically indexed.
+void Optimizer::performScalarReplacementOfAggregates()
+{
+ std::vector<Ice::InstAlloca *> newAllocas;
+
+ Ice::CfgNode *entryBlock = function->getEntryNode();
+ Ice::InstList &instList = entryBlock->getInsts();
+
+ for(Ice::Inst &inst : instList)
+ {
+ if(inst.isDeleted())
+ {
+ continue;
+ }
+
+ auto *alloca = llvm::dyn_cast<Ice::InstAlloca>(&inst);
+
+ if(!alloca)
+ {
+ break; // Allocas are all at the top
+ }
+
+ uint32_t sizeInBytes = llvm::cast<Ice::ConstantInteger32>(alloca->getSizeInBytes())->getValue();
+ uint32_t alignInBytes = alloca->getAlignInBytes();
+
+ // This pass relies on array elements to be naturally aligned (i.e. matches the type size).
+ assert(sizeInBytes >= alignInBytes);
+ assert(sizeInBytes % alignInBytes == 0);
+ uint32_t elementCount = sizeInBytes / alignInBytes;
+
+ Ice::Operand *address = alloca->getDest();
+
+ if(isStaticallyIndexedArray(address))
+ {
+ // Delete the old array.
+ alloca->setDeleted();
+
+ // Allocate new stack slots for each element.
+ std::vector<Ice::Variable *> newAddress(elementCount);
+ auto *bytes = Ice::ConstantInteger32::create(context, Ice::IceType_i32, alignInBytes);
+
+ for(uint32_t i = 0; i < elementCount; i++)
+ {
+ newAddress[i] = function->makeVariable(pointerType());
+ auto *alloca = Ice::InstAlloca::create(function, newAddress[i], bytes, alignInBytes);
+ setDefinition(newAddress[i], alloca);
+
+ newAllocas.push_back(alloca);
+ }
+
+ Uses uses = *getUses(address); // Hard copy
+
+ for(auto *use : uses)
+ {
+ assert(!use->isDeleted());
+
+ if(isLoad(*use)) // Direct use of base address
+ {
+ use->replaceSource(asLoadSubVector(use) ? 1 : 0, newAddress[0]);
+ getUses(newAddress[0])->insert(newAddress[0], use);
+ }
+ else if(isStore(*use)) // Direct use of base address
+ {
+ use->replaceSource(asStoreSubVector(use) ? 2 : 1, newAddress[0]);
+ getUses(newAddress[0])->insert(newAddress[0], use);
+ }
+ else // Statically indexed use
+ {
+ auto *arithmetic = llvm::cast<Ice::InstArithmetic>(use);
+
+ if(arithmetic->getOp() == Ice::InstArithmetic::Add)
+ {
+ auto *rhs = arithmetic->getSrc(1);
+ int32_t offset = llvm::cast<Ice::ConstantInteger32>(rhs)->getValue();
+
+ assert(offset % alignInBytes == 0);
+ int32_t index = offset / alignInBytes;
+ assert(static_cast<uint32_t>(index) < elementCount);
+
+ replace(arithmetic, newAddress[index]);
+ }
+ else
+ assert(false && "Mismatch between isStaticallyIndexedArray() and scalarReplacementOfAggregates()");
+ }
+ }
+ }
+ }
+
+ // After looping over all the old allocas, add any new ones that replace them.
+ // They're added to the front in reverse order, to retain their original order.
+ for(size_t i = newAllocas.size(); i-- != 0;)
+ {
+ if(!isDead(newAllocas[i]))
+ {
+ instList.push_front(newAllocas[i]);
+ }
+ }
+}
+
void Optimizer::eliminateDeadCode()
{
bool modified;
@@ -582,6 +698,7 @@ void Optimizer::deleteInstruction(Ice::Inst *instruction)
return;
}
+ assert(!instruction->getDest() || getUses(instruction->getDest())->empty());
instruction->setDeleted();
for(Ice::SizeT i = 0; i < instruction->getSrcSize(); i++)
@@ -639,6 +756,52 @@ bool Optimizer::isDead(Ice::Inst *instruction)
return false;
}
+bool Optimizer::isStaticallyIndexedArray(Ice::Operand *allocaAddress)
+{
+ auto &uses = *getUses(allocaAddress);
+
+ for(auto *use : uses)
+ {
+ // Direct load from base address.
+ if(isLoad(*use) && use->getLoadAddress() == allocaAddress)
+ {
+ continue; // This is fine.
+ }
+
+ if(isStore(*use))
+ {
+ // Can either be the address we're storing to, or the data we're storing.
+ if(use->getStoreAddress() == allocaAddress)
+ {
+ continue;
+ }
+ else
+ {
+ // propagateAlloca() eliminates most of the stores of the address itself.
+ // For the cases it doesn't handle, assume SRoA is not feasible.
+ return false;
+ }
+ }
+
+ // Pointer arithmetic is fine if it only uses constants.
+ auto *arithmetic = llvm::dyn_cast<Ice::InstArithmetic>(use);
+ if(arithmetic && arithmetic->getOp() == Ice::InstArithmetic::Add)
+ {
+ auto *rhs = arithmetic->getSrc(1);
+
+ if(llvm::isa<Ice::Constant>(rhs))
+ {
+ continue;
+ }
+ }
+
+ // If there's any other type of use, bail out.
+ return false;
+ }
+
+ return true;
+}
+
const Ice::InstIntrinsic *Optimizer::asLoadSubVector(const Ice::Inst *instruction)
{
if(auto *instrinsic = llvm::dyn_cast<Ice::InstIntrinsic>(instruction))
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index ada10727a..1e0cf63f9 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -101,7 +101,7 @@ Ice::Variable *allocateStackVariable(Ice::Cfg *function, Ice::Type type, int arr
auto bytes = Ice::ConstantInteger32::create(function->getContext(), Ice::IceType_i32, totalSize);
auto address = function->makeVariable(getPointerType(type));
- auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize);
+ auto alloca = Ice::InstAlloca::create(function, address, bytes, typeSize); // SRoA depends on the alignment to match the type size.
function->getEntryNode()->getInsts().push_front(alloca);
return address;
@@ -1104,7 +1104,7 @@ Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
auto bytes = Ice::ConstantInteger32::create(::context, Ice::IceType_i32, totalSize);
auto address = ::function->makeVariable(T(getPointerType(t)));
- auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
+ auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize); // SRoA depends on the alignment to match the type size.
::function->getEntryNode()->getInsts().push_front(alloca);
return V(address);
diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index 51a75319d..fc8665a88 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -508,6 +508,94 @@ TEST(ReactorUnitTests, ArrayOfPointersToLocals)
EXPECT_EQ(result, 222);
}
+TEST(ReactorUnitTests, ModifyLocalThroughPointer)
+{
+ FunctionT<int(void)> function;
+ {
+ Int a = 1;
+
+ Pointer<Int> p = &a;
+ Pointer<Pointer<Int>> pp = &p;
+
+ Pointer<Int> q = *pp;
+ *q = 3;
+
+ Return(a);
+ }
+
+ auto routine = function(testName().c_str());
+
+ int result = routine();
+ EXPECT_EQ(result, 3);
+}
+
+TEST(ReactorUnitTests, ScalarReplacementOfArray)
+{
+ FunctionT<int(void)> function;
+ {
+ Array<Int, 2> a;
+ a[0] = 1;
+ a[1] = 2;
+
+ Return(a[0] + a[1]);
+ }
+
+ auto routine = function(testName().c_str());
+
+ int result = routine();
+ EXPECT_EQ(result, 3);
+}
+
+TEST(ReactorUnitTests, CArray)
+{
+ FunctionT<int(void)> function;
+ {
+ Int a[2];
+ a[0] = 1;
+ a[1] = 2;
+
+ auto x = a[0];
+ a[0] = a[1];
+ a[1] = x;
+
+ Return(a[0] + a[1]);
+ }
+
+ auto routine = function(testName().c_str());
+
+ int result = routine();
+ EXPECT_EQ(result, 3);
+}
+
+// SRoA should replace the array elements with scalars, which in turn enables
+// eliminating all loads and stores.
+TEST(ReactorUnitTests, ReactorArray)
+{
+ FunctionT<int(void)> function;
+ {
+ Array<Int, 2> a;
+ a[0] = 1;
+ a[1] = 2;
+
+ Int x = a[0];
+ a[0] = a[1];
+ a[1] = x;
+
+ Return(a[0] + a[1]);
+ }
+
+ Nucleus::setOptimizerCallback([](const Nucleus::OptimizerReport *report) {
+ EXPECT_EQ(report->allocas, 0);
+ EXPECT_EQ(report->loads, 0);
+ EXPECT_EQ(report->stores, 0);
+ });
+
+ auto routine = function(testName().c_str());
+
+ int result = routine();
+ EXPECT_EQ(result, 3);
+}
+
TEST(ReactorUnitTests, SubVectorLoadStore)
{
FunctionT<int(void *, void *)> function;