diff options
author | Yang Ni <yangni@google.com> | 2015-01-12 13:03:40 -0800 |
---|---|---|
committer | Yang Ni <yangni@google.com> | 2015-02-02 17:37:03 -0800 |
commit | da0f069871343119251d6b0586be356dc2146a62 (patch) | |
tree | 3b3727ac5ff30a598b33a926e2536cc7c6190b05 /cpu_ref | |
parent | f5e39bba1d0e7be4adfa7364d92ba3f541420fdd (diff) | |
download | rs-da0f069871343119251d6b0586be356dc2146a62.tar.gz |
Runtime support for compiler kernel fusion.
The runtime will start a separate process to call the new bcc to fuse kernels.
Change-Id: Ia73ea917a126a5055ec97f13d90a5feaafd6a2f5
Diffstat (limited to 'cpu_ref')
-rw-r--r-- | cpu_ref/rsCpuScript.cpp | 19 | ||||
-rw-r--r-- | cpu_ref/rsCpuScript.h | 15 | ||||
-rw-r--r-- | cpu_ref/rsCpuScriptGroup2.cpp | 274 | ||||
-rw-r--r-- | cpu_ref/rsCpuScriptGroup2.h | 31 |
4 files changed, 293 insertions, 46 deletions
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp index f975094f..01fbcafd 100644 --- a/cpu_ref/rsCpuScript.cpp +++ b/cpu_ref/rsCpuScript.cpp @@ -161,15 +161,13 @@ static bool is_force_recompile() { #endif // RS_SERVER } -const static char *BCC_EXE_PATH = "/system/bin/bcc"; - static void setCompileArguments(std::vector<const char*>* args, const std::string& bcFileName, const char* cacheDir, const char* resName, const char* core_lib, bool useRSDebugContext, const char* bccPluginName) { rsAssert(cacheDir && resName && core_lib); - args->push_back(BCC_EXE_PATH); + args->push_back(android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH); args->push_back("-unroll-runtime"); args->push_back("-scalarize-load-store"); args->push_back("-o"); @@ -234,7 +232,8 @@ static bool compileBitcode(const std::string &bcFileName, } case 0: { // Child process ALOGV("Invoking BCC with: %s", compileCommandLine.c_str()); - execv(BCC_EXE_PATH, (char* const*)compileArguments); + execv(android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH, + (char* const*)compileArguments); ALOGE("execv() failed: %s", strerror(errno)); abort(); @@ -428,6 +427,8 @@ void* SharedLibraryUtils::loadSOHelper(const char *origName, const char *cacheDi return loaded; } +const char* RsdCpuScriptImpl::BCC_EXE_PATH = "/system/bin/bcc"; + #define MAXLINE 500 #define MAKE_STR_HELPER(S) #S #define MAKE_STR(S) MAKE_STR_HELPER(S) @@ -764,12 +765,8 @@ error: #ifndef RS_COMPATIBILITY_LIB for (size_t idx = 0; idx < pragmaCount; ++idx) { - if (pragmaKeys[idx] != nullptr) { - delete [] pragmaKeys[idx]; - } - if (pragmaValues[idx] != nullptr) { - delete [] pragmaValues[idx]; - } + delete [] pragmaKeys[idx]; + delete [] pragmaValues[idx]; } delete[] pragmaValues; @@ -867,6 +864,8 @@ bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir, } } + mBitcodeFilePath = bcFileName; + // Read RS symbol information from the .so. 
if ( !mScriptSO) { goto error; diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h index e4ae4d36..39bf53e8 100644 --- a/cpu_ref/rsCpuScript.h +++ b/cpu_ref/rsCpuScript.h @@ -237,6 +237,13 @@ protected: Allocation **mBoundAllocs; void * mIntrinsicData; bool mIsThreadable; + + public: + static const char* BCC_EXE_PATH; + const std::string& getBitcodeFilePath() const { return mBitcodeFilePath; } + + private: + std::string mBitcodeFilePath; }; Allocation * rsdScriptGetAllocationForPointer( @@ -244,10 +251,14 @@ Allocation * rsdScriptGetAllocationForPointer( const Script *script, const void *); - - } +#ifdef __LP64__ +#define SYSLIBPATH "/system/lib64" +#else +#define SYSLIBPATH "/system/lib" +#endif + } #endif diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp index 52cd8a02..90907d05 100644 --- a/cpu_ref/rsCpuScriptGroup2.cpp +++ b/cpu_ref/rsCpuScriptGroup2.cpp @@ -1,5 +1,15 @@ #include "rsCpuScriptGroup2.h" +#include <dlfcn.h> + +#include <string> +#include <vector> + +#ifndef RS_COMPATIBILITY_LIB +#include "bcc/Config/Config.h" +#include <sys/wait.h> +#endif + #include "cpu_ref/rsCpuCore.h" #include "rsClosure.h" #include "rsContext.h" @@ -7,13 +17,17 @@ #include "rsCpuScript.h" #include "rsScript.h" #include "rsScriptGroup2.h" +#include "rsScriptIntrinsic.h" + +using std::string; +using std::vector; namespace android { namespace renderscript { namespace { -static const size_t DefaultKernelArgCount = 2; +const size_t DefaultKernelArgCount = 2; void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, uint32_t xend, uint32_t outstep) { @@ -66,25 +80,45 @@ void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, mutable_kparams->usr = &closures; } -/* - Returns true if closure depends on any closure in batch via a glboal variable - TODO: this probably should go into class Closure. 
- */ -bool conflict(const list<CPUClosure*> &batch, CPUClosure* closure) { +} // namespace + +Batch::~Batch() { + for (CPUClosure* c : mClosures) { + delete c; + } + if (mScriptObj) { + dlclose(mScriptObj); + } +} + +bool Batch::conflict(CPUClosure* closure) const { + if (mClosures.empty()) { + return false; + } + + if (closure->mClosure->mKernelID.get() == nullptr || + mClosures.front()->mClosure->mKernelID.get() == nullptr) { + // An invoke should be in a batch by itself, so it conflicts with any other + // closure. + return true; + } + for (const auto &p : closure->mClosure->mGlobalDeps) { const Closure* dep = p.first; - for (CPUClosure* c : batch) { + for (CPUClosure* c : mClosures) { if (c->mClosure == dep) { + ALOGV("ScriptGroup2: closure %p conflicting with closure %p via its global", closure, dep); return true; } } } for (const auto &p : closure->mClosure->mArgDeps) { const Closure* dep = p.first; - for (CPUClosure* c : batch) { + for (CPUClosure* c : mClosures) { if (c->mClosure == dep) { for (const auto &p1 : *p.second) { - if (p1.second != nullptr) { + if (p1.second->get() != nullptr) { + ALOGV("ScriptGroup2: closure %p conflicting with closure %p via its arg", closure, dep); return true; } } @@ -94,12 +128,10 @@ bool conflict(const list<CPUClosure*> &batch, CPUClosure* closure) { return false; } -} // namespace - CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, const ScriptGroupBase *sg) : mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) { - list<CPUClosure*>* batch = new list<CPUClosure*>(); + Batch* batch = new Batch(this); for (Closure* closure: mGroup->mClosures) { const ScriptKernelID* kernelID = closure->mKernelID.get(); RsdCpuScriptImpl* si = @@ -110,32 +142,192 @@ CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, // TODO: Is mtls.fep.usrLen ever used? 
CPUClosure* cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel, mtls.fep.usr, mtls.fep.usrLen); - if (conflict(*batch, cc)) { + if (batch->conflict(cc)) { mBatches.push_back(batch); - batch = new list<CPUClosure*>(); + batch = new Batch(this); } - batch->push_back(cc); + + batch->mClosures.push_back(cc); } + mBatches.push_back(batch); + +#ifndef RS_COMPATIBILITY_LIB + for (Batch* batch : mBatches) { + batch->tryToCreateFusedKernel(mGroup->mCacheDir.c_str()); + } +#endif } CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { - for (list<CPUClosure*>* batch : mBatches) { - for (CPUClosure* c : *batch) { - delete c; + for (Batch* batch : mBatches) { + delete batch; + } +} + +namespace { + +#ifndef RS_COMPATIBILITY_LIB + +string getFileName(string path) { + unsigned found = path.find_last_of("/\\"); + return path.substr(found + 1); +} + +void setupCompileArguments( + const vector<string>& inputs, const vector<int>& kernels, + const string& output_dir, const string& output_filename, + const string& rsLib, vector<const char*>* args) { + args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); + args->push_back("-fPIC"); + args->push_back("-embedRSInfo"); + args->push_back("-mtriple"); + args->push_back(DEFAULT_TARGET_TRIPLE_STRING); + args->push_back("-bclib"); + args->push_back(rsLib.c_str()); + for (const string& input : inputs) { + args->push_back(input.c_str()); + } + for (int kernel : kernels) { + args->push_back("-k"); + string strKernel = std::to_string(kernel); + args->push_back(strKernel.c_str()); + } + args->push_back("-output_path"); + args->push_back(output_dir.c_str()); + args->push_back("-o"); + args->push_back(output_filename.c_str()); + args->push_back(nullptr); +} + +string convertListToString(int n, const char* const* strs) { + string ret; + ret.append(strs[0]); + for (int i = 1; i < n; i++) { + ret.append(" "); + ret.append(strs[i]); + } + return ret; +} + +bool fuseAndCompile(const char** arguments, + const string& commandLine) { + const pid_t pid = fork(); 
+ + if (pid == -1) { + ALOGE("Couldn't fork for bcc execution"); + return false; + } + + if (pid == 0) { + // Child process + ALOGV("Invoking BCC with: %s", commandLine.c_str()); + execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); + + ALOGE("execv() failed: %s", strerror(errno)); + abort(); + return false; + } + + // Parent process + int status = 0; + const pid_t w = waitpid(pid, &status, 0); + if (w == -1) { + return false; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { + ALOGE("bcc terminated unexpectedly"); + return false; + } + + return true; +} +#endif + +} // anonymous namespace + +void Batch::tryToCreateFusedKernel(const char *cacheDir) { +#ifndef RS_COMPATIBILITY_LIB + if (mClosures.size() < 2) { + ALOGV("Compiler kernel fusion skipped due to only one or zero kernel in" + " a script group batch."); + return; + } + + //===--------------------------------------------------------------------===// + // Fuse the input kernels and generate native code in an object file + //===--------------------------------------------------------------------===// + + std::vector<string> inputFiles; + std::vector<int> slots; + + for (CPUClosure* cpuClosure : mClosures) { + const Closure* closure = cpuClosure->mClosure; + const ScriptKernelID* kernelID = closure->mKernelID.get(); + const Script* script = kernelID->mScript; + + if (script->isIntrinsic()) { + return; } + + const RsdCpuScriptImpl *cpuScript = + (const RsdCpuScriptImpl*)script->mHal.drv; + + const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); + + inputFiles.push_back(bitcodeFilename); + slots.push_back(kernelID->mSlot); + } + + string outputPath(tempnam(cacheDir, "fused")); + string outputFileName = getFileName(outputPath); + string objFilePath(outputPath); + objFilePath.append(".o"); + string rsLibPath(SYSLIBPATH"/libclcore.bc"); + vector<const char*> arguments; + setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath, + &arguments); + string 
commandLine = + convertListToString(arguments.size() - 1, arguments.data()); + + if (!fuseAndCompile(arguments.data(), commandLine)) { + return; + } + + //===--------------------------------------------------------------------===// + // Create and load the shared lib + //===--------------------------------------------------------------------===// + + const char* resName = outputFileName.c_str(); + + if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { + ALOGE("Failed to link object file '%s'", resName); + return; + } + + void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); + if (mSharedObj == nullptr) { + ALOGE("Unable to load '%s'", resName); + return; } + + mExecutable = ScriptExecutable::createFromSharedObject( + nullptr, // RS context. Unused. + mSharedObj); + +#endif // RS_COMPATIBILITY_LIB } void CpuScriptGroup2Impl::execute() { - for (list<CPUClosure*>* batch : mBatches) { - setGlobalsForBatch(*batch); - runBatch(*batch); + for (auto batch : mBatches) { + batch->setGlobalsForBatch(); + batch->run(); } } -void CpuScriptGroup2Impl::setGlobalsForBatch(const list<CPUClosure*>& batch) { - for (CPUClosure* cpuClosure : batch) { +void Batch::setGlobalsForBatch() { + for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const ScriptKernelID* kernelID = closure->mKernelID.get(); Script* s = kernelID->mScript; @@ -152,8 +344,32 @@ void CpuScriptGroup2Impl::setGlobalsForBatch(const list<CPUClosure*>& batch) { } } -void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) { - for (CPUClosure* cpuClosure : batch) { +void Batch::run() { + if (mExecutable != nullptr) { + MTLaunchStruct mtls; + const CPUClosure* firstCpuClosure = mClosures.front(); + const CPUClosure* lastCpuClosure = mClosures.back(); + + firstCpuClosure->mSi->forEachMtlsSetup( + (const Allocation**)&firstCpuClosure->mClosure->mArgs[0], + firstCpuClosure->mClosure->mArgs.size(), + lastCpuClosure->mClosure->mReturnValue, + 
nullptr, 0, nullptr, &mtls); + + mtls.script = nullptr; + mtls.fep.usr = nullptr; + mtls.kernel = mExecutable->getForEachFunction(0); + + mGroup->getCpuRefImpl()->launchThreads( + (const Allocation**)&firstCpuClosure->mClosure->mArgs[0], + firstCpuClosure->mClosure->mArgs.size(), + lastCpuClosure->mClosure->mReturnValue, + nullptr, &mtls); + + return; + } + + for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const ScriptKernelID* kernelID = closure->mKernelID.get(); cpuClosure->mSi->preLaunch(kernelID->mSlot, @@ -163,7 +379,7 @@ void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) { nullptr); } - const CPUClosure* cpuClosure = batch.front(); + const CPUClosure* cpuClosure = mClosures.front(); const Closure* closure = cpuClosure->mClosure; MTLaunchStruct mtls; @@ -174,12 +390,12 @@ void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) { mtls.script = nullptr; mtls.kernel = (void (*)())&groupRoot; - mtls.fep.usr = &batch; + mtls.fep.usr = &mClosures; - mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); + mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); } - for (CPUClosure* cpuClosure : batch) { + for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const ScriptKernelID* kernelID = closure->mKernelID.get(); cpuClosure->mSi->postLaunch(kernelID->mSlot, diff --git a/cpu_ref/rsCpuScriptGroup2.h b/cpu_ref/rsCpuScriptGroup2.h index 6cb72a65..53a0fe59 100644 --- a/cpu_ref/rsCpuScriptGroup2.h +++ b/cpu_ref/rsCpuScriptGroup2.h @@ -13,6 +13,7 @@ namespace renderscript { class Closure; class RsdCpuScriptImpl; class RsdCpuReferenceImpl; +class ScriptExecutable; class ScriptGroup2; struct RsExpandKernelParams; @@ -36,6 +37,28 @@ class CPUClosure { const size_t mUsrSize; }; +class CpuScriptGroup2Impl; + +class Batch { + public: + Batch(CpuScriptGroup2Impl* group) : mGroup(group), mExecutable(nullptr) {} + + ~Batch(); + + // Returns true 
if closure depends on any closure in this batch for a global + // variable + bool conflict(CPUClosure* closure) const; + + void tryToCreateFusedKernel(const char* cacheDir); + void setGlobalsForBatch(); + void run(); + + CpuScriptGroup2Impl* mGroup; + ScriptExecutable* mExecutable; + void* mScriptObj; + list<CPUClosure*> mClosures; +}; + class CpuScriptGroup2Impl : public RsdCpuReference::CpuScriptGroup2 { public: CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, const ScriptGroupBase* group); @@ -44,14 +67,12 @@ class CpuScriptGroup2Impl : public RsdCpuReference::CpuScriptGroup2 { bool init(); virtual void execute(); - private: - void setGlobalsForBatch(const list<CPUClosure*>& batch); - void runBatch(const list<CPUClosure*>& batch); + RsdCpuReferenceImpl* getCpuRefImpl() const { return mCpuRefImpl; } + private: RsdCpuReferenceImpl* mCpuRefImpl; const ScriptGroup2* mGroup; - - list<list<CPUClosure*>*> mBatches; + list<Batch*> mBatches; }; } // namespace renderscript |