1. createSession
Creates a session from a ScheduleConfig and a RuntimeInfo.
// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
    return createMultiPathSession({config}, runtime);
}
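For orientation, this is how the path above is normally reached from user code. The following is a minimal caller-side sketch, assuming the public Interpreter API ("model.mnn" is a placeholder path), not MNN source:

// Hypothetical usage sketch: create an Interpreter and a Session.
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    // createFromFile returns a raw pointer owned by the caller.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    if (net == nullptr) {
        return -1;
    }
    MNN::ScheduleConfig config;
    config.type      = MNN_FORWARD_CPU; // forward type for this run
    config.numThread = 4;
    // createSession(config) ends up in the createMultiPathSession path shown above.
    MNN::Session* session = net->createSession(config);
    return session != nullptr ? 0 : -1;
}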
1.1 createMultiPathSession
// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
    // ...
    auto result = newSession.get();
    auto validForResize = info.validForResize;
    if (validForResize && mNet->modes.inputMode == Session_Input_Inside && mNet->modes.resizeMode == Session_Resize_Direct) {
        result->resize();
    }
    // ...
    return result;
}
1.1.1 Session::resize
// source/core/Session.cpp
ErrorCode Session::resize() {
    // ...
    if (mNeedResize) {
        bool debug = mCallBackMode == Interpreter::Session_Debug;
        // mPipelines has type std::vector<std::shared_ptr<Pipeline>>
        for (auto& iter : mPipelines) {
            auto error = iter->encode(debug, permitCodegen);
            if (NO_ERROR != error) {
                return error;
            }
        }
        mNeedResize = false;
        mNeedMalloc = true;
        firstMalloc = true;
    }
    // ...
}
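Session::resize() is not only run inside createSession; it runs again whenever the caller changes an input shape. Below is a minimal usage sketch, assuming the standard Interpreter API (the shape {1, 3, 224, 224} is a placeholder):

// Hypothetical usage sketch: re-trigger Session::resize after a shape change.
#include <MNN/Interpreter.hpp>
#include <MNN/Tensor.hpp>

void resizeInput(MNN::Interpreter* net, MNN::Session* session) {
    // getSessionInput(session, nullptr) returns the default input tensor.
    MNN::Tensor* input = net->getSessionInput(session, nullptr);
    // Declare the new shape; this marks the session as needing a resize.
    net->resizeTensor(input, {1, 3, 224, 224});
    // resizeSession walks the pipelines again, i.e. Session::resize() above.
    net->resizeSession(session);
}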
1.1.1.1 Pipeline::encode
// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
// struct BackendCache {
//     Backend::Info info;
//     BackendConfig config;
//     std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
//     bool needComputeShape = true;
//     bool needComputeGeometry = true;
//     bool reportError = true;
//     std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
// };
//
// /** pipeline info */
// struct OpCacheInfo {
//     /** op */
//     const Op* op;
//     /** input tensors */
//     std::vector<Tensor*> inputs;
//     /** output tensors */
//     std::vector<Tensor*> outputs;
//     /** schedule type*/
//     Schedule::Type type = Schedule::Type::SEPARATE;
//
//     /**Command buffer for cache*/
//     CommandBuffer cacheBuffer;
//
//     /**Command buffer for execute*/
//     CommandBuffer executeBuffer;
//
//     std::map<const Op*, std::shared_ptr<Execution>> executionCache;
// };
//
ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) {
    // mInfo.first.cache has type std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>>
    // mBackend is the selected backend, e.g. VulkanBackend
    auto& mBackend = mInfo.first.cache.first;
    // mBackupBackend is the fallback (default) backend, e.g. CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    // Static Model just copy info to command buffer
    // mInfo.first has type BackendCache
    if (!mInfo.first.needComputeGeometry) {
        // ...
    } else {
#ifndef MNN_BUILD_MINI
        // mContext has type GeometryComputer::Context
        mContext.clear();
        /** Size Compute and compute Const Begin */
        auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen);
        if (res != NO_ERROR) {
            return res;
        }
#endif
    }
    // ...
    return NO_ERROR;
}
1.1.1.1.1 GeometryComputerUtils::shapeComputeAndGeometryTransform
The full code of GeometryComputerUtils::shapeComputeAndGeometryTransform:
// source/geometry/GeometryComputerUtils.cpp
// /** pipeline info */
// struct OpCacheInfo {
//     /** op */
//     const Op* op;
//     /** input tensors */
//     std::vector<Tensor*> inputs;
//     /** output tensors */
//     std::vector<Tensor*> outputs;
//     /** schedule type*/
//     Schedule::Type type = Schedule::Type::SEPARATE;
//
//     /**Command buffer for cache*/
//     CommandBuffer cacheBuffer;
//
//     /**Command buffer for execute*/
//     CommandBuffer executeBuffer;
//
//     std::map<const Op*, std::shared_ptr<Execution>> executionCache;
// };
//
ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
    std::vector<Schedule::OpCacheInfo>& infos,
    GeometryComputer::Context& geoContext,
    std::shared_ptr<Backend> backupBackend,
    Runtime::CompilerType compileType,
    bool skipShapeCompute,
    bool permitCodegen) {
    /** Size Compute and compute Const Begin */
    GeometryComputer::Context ctx(backupBackend);
    // Size Compute and compute Const
    // infos is the operator cache; its size here is 171
    for (int i = 0; i < infos.size(); ++i) {
        // info has type OpCacheInfo
        auto& info = infos[i];
        auto& cmdBufferVir = info.executeBuffer;
        auto& tempBuffer = info.cacheBuffer;
        // ...
        if (info.type == Schedule::CONSTANT) {
            // ...
            for (auto& cp : cmdBufferVir.command) {
                auto& c = *cp;
                if (nullptr == c.execution) {
                    c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
                }
                auto exe = c.execution;
                if (nullptr == exe.get()) {
                    MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str());
                    return NO_EXECUTION;
                }
                for (auto t : c.outputs) {
                    auto des = TensorUtils::getDescribe(t);
                    TensorUtils::setLinearLayout(t);
                    auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
                    if (!res) {
                        return OUT_OF_MEMORY;
                    }
                    des->setBackend(backupBackend.get());
                }
                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = backupBackend->onResizeEnd();
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = exe->onExecute(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
            }
            // Clear const command
            ctx.pushCache(cmdBufferVir);
            cmdBufferVir.command.clear();
            cmdBufferVir.extras.clear();
        }
    }
    /** Size Compute and compute Const End */
    // ...
    return NO_ERROR;
}
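The subsections that follow step into each call of this constant-folding path. As an overview, the sequence inside the CONSTANT branch above is, paraphrased into comment form (this is a condensed reading of the code, not a separate API):

// For every command c of a CONSTANT OpCacheInfo:
//   1. backupBackend->onCreate(c.inputs, c.outputs, c.op)  -> build the Execution (see 1.1.1.1.1.1)
//   2. backupBackend->onAcquireBuffer(t, Backend::STATIC)  -> allocate each output tensor (see 1.1.1.1.1.2)
//   3. backupBackend->onResizeBegin()                      -> reset the dynamic allocator (see 1.1.1.1.1.3)
//   4. exe->onResize(c.inputs, c.outputs)                  -> per-op shape/buffer preparation (see 1.1.1.1.1.4)
//   5. backupBackend->onResizeEnd()                        -> finalize the memory plan (see 1.1.1.1.1.5)
//   6. exe->onExecute(c.inputs, c.outputs)                 -> actually fold the constant (see 1.1.1.1.1.6)
// Afterwards the constant commands are cached (ctx.pushCache) and cleared from the execute buffer.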
1.1.1.1.1.1 CPUBackend::onCreate
The call to CPUBackend::onCreate inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
for (auto& cp : cmdBufferVir.command) {
    auto& c = *cp;
    if (nullptr == c.execution) {
        c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
    }
    // ...
}
Since the backupBackend passed in is a CPUBackend (which inherits from Backend), the implementation that actually runs is CPUBackend::onCreate:
// source/backend/cpu/CPUBackend.cpp
/// get execution
Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
    /** BatchNorm it will be converted to scale for model convert, don't print error log */
    if (op->type() == OpType_BatchNorm) {
        return nullptr;
    }
    auto opType = op->type();
    if (outputs.size() > 0) {
        if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) {
            opType = _getRealOpType(opType);
        }
    }
    // TODO: rm this convert when merge diff datatyoe of op
    auto map = gCreator;
    auto iter = map->find(opType);
    if (iter == map->end()) {
        MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str());
        return nullptr;
    }
    Execution* exe = nullptr;
    bool needCast = false;
    if (exe == nullptr) {
        exe = iter->second->onCreate(inputs, outputs, op, this);
    }
    return exe;
}
1.1.1.1.1.1.1 CPUBackend::Creator::onCreate
The call to CPUBackend::Creator::onCreate inside CPUBackend::onCreate looks like this:
auto map = gCreator;
auto iter = map->find(opType);
// ...
Execution* exe = nullptr;
bool needCast = false;
if (exe == nullptr) {
    // create the operator Execution according to opType
    exe = iter->second->onCreate(inputs, outputs, op, this);
}
Note: the iter->second->onCreate call is polymorphic; at runtime a different subclass is invoked depending on the operator type opType. The base class is CPUBackend::Creator.
One of the implementing classes is CPURasterFactory; its implementation is:
// source/backend/cpu/CPURaster.cpp
class CPURasterFactory : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        if (op->type() == OpType_While) {
            if (op->main_type() != OpParameter_LoopParam) {
                return nullptr;
            }
            return new CPULoop(backend, op->main_as_LoopParam());
        }
        return new CPURaster(backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_Raster);
REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_While);
1.1.1.1.1.1.2 Backend
Backend is the base class for all backends; its definition is:
// source/core/Backend.hpp /** abstract backend */ class Backend : public NonCopyable { public: /** info used to create backend */ struct Info { /** forward type. */ MNNForwardType type = MNN_FORWARD_CPU; /** numThread for CPU . number of threads. gpuMode for GPU only. tuning/memory Mode setting. */ union { int numThread = 4; int gpuMode; }; /** user data. */ BackendConfig* user = NULL; enum Mode { // The Op will be run in execution->onExecute DIRECT = 0, // The Op will be recorded. Run in onExecuteBegin and Wait in onExecuteEnd INDIRECT = 1 }; Mode mode = DIRECT; enum Allocator { DEFER = 0, EAGER = 1 }; Allocator allocator = DEFER; }; /** backend buffer storage type */ enum StorageType { /** use NOT reusable memory. - allocates memory when `onAcquireBuffer` is called. - releases memory when `onReleaseBuffer` is called or when the backend is deleted. - do NOTHING when `onClearBuffer` is called. */ STATIC, /** use reusable memory. - allocates or reuses memory when `onAcquireBuffer` is called. prefers reusing. - collects memory for reuse when `onReleaseBuffer` is called. - releases memory when `onClearBuffer` is called or when the backend is deleted. */ DYNAMIC, /** use NOT reusable memory. - allocates memory when `onAcquireBuffer` is called. - do NOTHING when `onReleaseBuffer` is called. - releases memory when `onClearBuffer` is called or when the backend is deleted. */ DYNAMIC_SEPERATE }; public: /** * @brief initializer. * @param type forward type. */ Backend(MNNForwardType type) : mType(type) { // nothing to do } /** * @brief deinitializer. */ virtual ~Backend() = default; public: /** * @brief create execution for op with input and output tensors. * @param inputs input tensors. * @param outputs output tensors. * @param op given op. * @return created execution if op is supported, nullptr otherwise. */ virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) = 0; /** * @brief callback before resize ops. */ virtual void onResizeBegin() { // nothing to do } /** * @brief callback after resize ops. */ virtual ErrorCode onResizeEnd() = 0; /** * @brief callback before executing ops. */ virtual void onExecuteBegin() const = 0; /** * @brief callback after executing ops. */ virtual void onExecuteEnd() const = 0; virtual const Runtime* getRuntime() { return nullptr; } const std::string externalFile(); public: /** * @brief allocate buffer of tensor for given storage type. * @param tensor buffer provider. * @param storageType buffer storage type. * @return success or not. */ MNN_PUBLIC bool onAcquireBuffer(const Tensor* tensor, StorageType storageType); /** * @brief release buffer of tensor for given storage type. * @param tensor buffer provider. * @param storageType buffer storage type. * @return success or not. */ MNN_PUBLIC bool onReleaseBuffer(const Tensor* tensor, StorageType storageType); class MemObj { public: MemObj() {} virtual ~ MemObj() {} virtual MemChunk chunk() { return MemChunk(); } }; /** * @brief allocate buffer of tensor for given storage type. * @param tensor buffer provider. * @param storageType buffer storage type. * @return MemObj for release, if failed, return nullptr. */ virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) = 0; /** * @brief get buffer from tensor directly * @param tensor buffer provider. * @return support or not */ virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) { return false; } /** * @brief clear all dynamic buffers. * @return success or not. 
*/ virtual bool onClearBuffer() = 0; /** * @brief copy buffer from tensor to tensor. * @param srcTensor source buffer provider. * @param dstTensor dest buffer provider. */ virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const = 0; public: /** * @brief get forward type. * @return forward type. */ inline MNNForwardType type() const { return mType; } public: /** * @brief get Gpu Tensor map host ptr/ unmap */ virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) { return nullptr; } virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) { return false; } virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) { return 0; } private: const MNNForwardType mType; };
1.1.1.1.1.1.3 Execution
Execution is the base class for concrete operator implementations; its definition is:
// source/core/Execution.hpp /** abstract execution */ class Execution : public NonCopyable { public: /** * @brief initializer. * @param backend backend that exection will running on. */ Execution() = delete; Execution(Backend *backend) : mBackEnd(backend) { // nothing to do } /** * @brief deinitializer. */ virtual ~Execution() = default; /** * @brief response shape change of input or output tensors. * @param inputs input tensors * @param outputs output tensors * @return resize result */ virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { return NO_ERROR; } /** * @brief perform execution. * @param inputs input tensors * @param outputs output tensors * @return execution result */ virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0; /** * @brief clone execution, new execution will share weight from this execution * @param bn the cloned' execution's backend * @param dst if dst = nullptr, just return whether execution can clone, otherwise clone the execution into dst * @return execution result */ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) { return false; } public: /** * @brief designed for plugin system. not ready yet. */ class Creator : public NonCopyable { public: /** * @brief deinitializer. */ virtual ~Creator() = default; /** * @brief create execution for given op on given backend. * @param backend given backend. * @param op given op. * @return execution. */ virtual Execution *onCreate(Backend *backend, const Op *op) const = 0; }; // Search for extra creator, if not found, return nullptr MNN_PUBLIC static const Creator *searchExtraCreator(const std::string &key, MNNForwardType type); /** * @brief register creator for given key and backend type. * @param creator registering creator. * @param key given key. * @param type given backend type. * @return false if registered creator for same key and type exists, true otherwise. */ MNN_PUBLIC static bool insertExtraCreator(std::shared_ptr<Creator> creator, const std::string &key, MNNForwardType type); /** * @brief unregister creator for given key and backend type. * @param key given key. * @param type given backend type. * @return true if registered creator for given key and type exists, false otherwise. */ MNN_PUBLIC static bool removeExtraCreator(const std::string &key, MNNForwardType type); public: /** * @brief check if execution is valid. * @return valid or not. */ inline bool valid() const { return mValid; } /** * @brief get backend. * @return backend. */ Backend *backend() const { return mBackEnd; } protected: bool mValid = true; private: Backend *mBackEnd; };
1.1.1.1.1.1.4 Registration of CPU operator Execution creators
CPUBackend::onCreate uses the gCreator map, which caches all CPU operator creator instances (CPUBackend::Creator). They are initialized and registered when registerBackend calls registerCPURuntimeCreator.
// source/core/BackendRegister.cpp
static std::once_flag s_flag;
void registerBackend() {
    std::call_once(s_flag, [&]() {
        // ...
        registerCPURuntimeCreator();
        // ...
    });
}
registerCPURuntimeCreator() is implemented as follows:
// source/backend/cpu/CPUBackend.cpp
void registerCPURuntimeCreator() {
    CPUBackend::initCreatorMap();
    registerCPUOps();
#ifdef MNN_SUPPORT_BF16
    registerBF16Backend();
#endif
#ifdef MNN_USE_ARMV82
    registerArm82RuntimeCreator();
#endif
    // TODO: Merge _initCoreFunction MNNFunctionInit and cpuinfo_arm_init
    MNNCoreFunctionInit();
    MNNInsertExtraRuntimeCreator(MNN_FORWARD_CPU, new CPURuntimeCreator);
};
registerCPUOps registers all CPU operator Execution creators:
// source/backend/cpu/CPUOPRegister.cpp
void registerCPUOps() {
    ___CPUCropAndResizeCreator__OpType_CropAndResize__();
    ___CPUArgMaxCreator__OpType_ArgMax__();
    ___CPUArgMaxCreator__OpType_ArgMin__();
    // ...
}
A function such as ___CPUArgMaxCreator__OpType_ArgMax__ is defined by the REGISTER_CPU_OP_CREATOR macro:
// source/backend/cpu/CPUBackend.hpp
#define REGISTER_CPU_OP_CREATOR(name, opType)     \
    void ___##name##__##opType##__() {            \
        static name _temp;                        \
        CPUBackend::addCreator(opType, &_temp);   \
    }
The creator it registers for ArgMax/ArgMin is implemented as follows:
// source/backend/cpu/CPUArgMax.cpp
class CPUArgMaxCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const {
        auto argMax = op->main_as_ArgMax();
        if (op->type() == OpType_ArgMin) {
            return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMIN,
                                 argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
        } else {
            return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMAX,
                                 argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
        }
    }
};

REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax);
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMin);
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax) expands to:
// REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax)
void ___CPUArgMaxCreator__OpType_ArgMax__() {
    static CPUArgMaxCreator _temp;
    CPUBackend::addCreator(OpType_ArgMax, &_temp);
}
Registration goes through CPUBackend::addCreator, which is implemented as:
// source/backend/cpu/CPUBackend.cpp
bool CPUBackend::addCreator(OpType t, Creator* c) {
    auto map = gCreator;
    if (map->find(t) != map->end()) {
        MNN_PRINT("Error: %d type has be added\n", t);
        return false;
    }
    map->insert(std::make_pair(t, c));
    return true;
}
As the code shows, each creator ultimately ends up registered in gCreator.
To sum up, the macro expands to an ordinary function whose name, ___CPUArgMaxCreator__OpType_ArgMax__, matches the call made in registerCPUOps, and gCreator is the same map that CPUBackend::onCreate later looks up. The self-contained sketch below condenses this registration-and-lookup pattern.
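To make the pattern easier to see in isolation, here is a self-contained sketch with simplified stand-in types; OpType, Execution, Creator and MyOpCreator below are illustrative, not the real MNN definitions:

#include <cstdio>
#include <map>

// Simplified stand-ins for MNN's OpType / Execution / CPUBackend::Creator.
enum OpType { OpType_ArgMax, OpType_Raster };

struct Execution { virtual ~Execution() = default; };

struct Creator {
    virtual ~Creator() = default;
    virtual Execution* onCreate() const = 0;
};

static std::map<OpType, Creator*>* gCreator = new std::map<OpType, Creator*>;

static bool addCreator(OpType t, Creator* c) {
    if (gCreator->find(t) != gCreator->end()) {
        return false; // same duplicate-registration guard as CPUBackend::addCreator
    }
    gCreator->insert(std::make_pair(t, c));
    return true;
}

// What REGISTER_CPU_OP_CREATOR(MyOpCreator, OpType_ArgMax) expands to, in spirit.
struct MyOpCreator : Creator {
    Execution* onCreate() const override { return new Execution; }
};
void ___MyOpCreator__OpType_ArgMax__() {
    static MyOpCreator _temp;
    addCreator(OpType_ArgMax, &_temp);
}

int main() {
    ___MyOpCreator__OpType_ArgMax__();          // registerCPUOps() plays this role
    auto iter = gCreator->find(OpType_ArgMax);  // CPUBackend::onCreate plays this role
    Execution* exe = (iter != gCreator->end()) ? iter->second->onCreate() : nullptr;
    std::printf("created: %s\n", exe ? "yes" : "no");
    delete exe;
    return 0;
}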
Adding a CPU implementation

1. Add CPUMyCustomOp.hpp and CPUMyCustomOp.cpp under the source/backend/cpu directory.
- Implementation class declaration:
class CPUMyCustomOp : public Execution {
public:
    // If onExecute needs a scratch buffer, acquire it in this function; if not, this override can be omitted.
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override;
    // The actual op execution.
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override;
};
2. Implement onResize and onExecute.
In onResize, call backend()->onAcquireBuffer(&mCache, Backend::DYNAMIC) to request the scratch buffer and backend()->onReleaseBuffer(&mCache, Backend::DYNAMIC) to return it; memory released this way can be reused by later ops. In onExecute, perform the necessary input checks, which helps surface problems early, and return NO_ERROR when execution succeeds. A sketch of both functions follows below.
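Below is a hedged sketch of what CPUMyCustomOp.cpp could look like under these rules. MyCustomOp is the placeholder op, and the mCache member plus the constructor are assumed to be declared in CPUMyCustomOp.hpp (they are not part of the declaration shown above); the compute body is left as a comment.

// Hypothetical CPUMyCustomOp.cpp sketch, assuming the declaration above plus a Tensor mCache member.
#include "CPUMyCustomOp.hpp"
#include "core/TensorUtils.hpp"

namespace MNN {

ErrorCode CPUMyCustomOp::onResize(const std::vector<Tensor *> &inputs,
                                  const std::vector<Tensor *> &outputs) {
    // Shape the scratch tensor after the first input (adapt to the real op's needs),
    // then acquire its memory from the backend.
    TensorUtils::copyShape(inputs[0], &mCache, true);
    if (!backend()->onAcquireBuffer(&mCache, Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    // Release right away: DYNAMIC memory released here can be reused by later ops,
    // while the chunk stays usable for this op's onExecute.
    backend()->onReleaseBuffer(&mCache, Backend::DYNAMIC);
    return NO_ERROR;
}

ErrorCode CPUMyCustomOp::onExecute(const std::vector<Tensor *> &inputs,
                                   const std::vector<Tensor *> &outputs) {
    // Cheap sanity checks first: catching bad inputs early simplifies debugging.
    if (inputs.empty() || outputs.empty()) {
        return INPUT_DATA_ERROR;
    }
    // ... compute outputs from inputs, using mCache as scratch space ...
    return NO_ERROR;
}

} // namespace MNN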
3. Register the implementation class.
class CPUMyCustomOpCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        return new CPUMyCustomOp(backend);
    }
};
REGISTER_CPU_OP_CREATOR(CPUMyCustomOpCreator, OpType_MyCustomOp);
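Note that REGISTER_CPU_OP_CREATOR only defines the registration function; it still has to be invoked during backend registration. Following the pattern of CPUOPRegister.cpp shown earlier, a hypothetical addition would look like:

// Hypothetical addition to source/backend/cpu/CPUOPRegister.cpp for the new op.
extern void ___CPUMyCustomOpCreator__OpType_MyCustomOp__();

void registerCPUOps() {
    // ... existing creator registrations ...
    ___CPUMyCustomOpCreator__OpType_MyCustomOp__();
}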
1.1.1.1.1.2 Backend::onAcquireBuffer
The call to Backend::onAcquireBuffer inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
onAcquireBuffer exists only on the Backend base class; it allocates memory for a tensor. Its implementation is:
bool Backend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) {
    auto mem = this->onAcquire(tensor, storageType);
    if (nullptr == mem) {
        return false;
    }
    if (mem == TensorUtils::getDescribe(tensor)->mem.get()) {
        return true;
    }
    TensorUtils::getDescribe(tensor)->mem.reset(mem);
    return true;
}
onAcquireBuffer delegates to onAcquire, a virtual function, and stores the returned MemObj in the tensor's describe, tying the buffer's lifetime to the tensor. Since the backupBackend passed in is a CPUBackend (which inherits from Backend), the call resolves to CPUBackend::onAcquire:
// source/backend/cpu/CPUBackend.cpp
Backend::MemObj* CPUBackend::onAcquire(const MNN::Tensor* nativeTensorConst, StorageType storageType) {
    if (nativeTensorConst == nullptr) {
        return nullptr;
    }
    //FUNC_PRINT_ALL(nativeTensorConst, p);
    auto nativeTensor = (Tensor*)nativeTensorConst;
    auto size = getTensorSize(nativeTensor, true);
    return allocBuffer(size, nativeTensor, storageType);
}
1.1.1.1.1.3 Backend::onResizeBegin
The call to Backend::onResizeBegin inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResizeBegin is a virtual function; since backupBackend is a CPUBackend (which inherits from Backend), the call resolves to CPUBackend::onResizeBegin:
// source/backend/cpu/CPUBackend.cpp
void CPUBackend::onResizeBegin() {
    mDynamicAllocator->reset();
}
1.1.1.1.1.4 Execution::onResize
The call to Execution::onResize inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResize is a virtual function. How exe is created is described under CPUBackend::onCreate; the exe->onResize call is polymorphic, with Execution as the base class. We pick one implementation, CPULoop, for analysis:
// source/backend/cpu/CPURaster.cpp class CPULoop : public Execution { virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override { int inputIndexSize = mLoop->inputIndexes()->size(); MNN_ASSERT(inputIndexSize == inputs.size()); for (int i=0; i<inputIndexSize; ++i) { mStack[mLoop->inputIndexes()->data()[i]] = inputs[i]; } int outputIndexSize = mLoop->outputIndexes()->size(); MNN_ASSERT(outputIndexSize == outputs.size()); for (int i=0; i<outputIndexSize; ++i) { mStack[mLoop->outputIndexes()->data()[i]] = outputs[i]; } int numberThread = mLoop->parallel() ? static_cast<CPUBackend*>(backend())->threadNumber() : 1; mMaxCacheSize = 0; auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes; mMaxFuseBufferSize = 0; for (int i=0; i<mLoop->commands()->size(); ++i) { auto cmd = mLoop->commands()->GetAs<RegionCommand>(i); auto op = cmd->op(); if (cmd->fuse() >= 0) { // Make Temp output buffer auto size = cmd->size()->data(); if (cmd->op()->type() == OpType_MatMul) { mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]); } else { mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]); } } if (OpType_UnaryOp == op->type()) { if (nullptr != op->main_as_UnaryOp()) { auto view0 = cmd->view()->GetAs<View>(0); auto view1 = cmd->view()->GetAs<View>(1); MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0); if (view1->stride()->data()[2] != 1) { mMaxCacheSize = std::max(mMaxCacheSize, cmd->size()->data()[2] * bytes); } } continue; } if (OpType_BinaryOp == op->type()) { auto view0 = cmd->view()->GetAs<View>(0); auto view1 = cmd->view()->GetAs<View>(1); auto view2 = cmd->view()->GetAs<View>(2); MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0); if (view1->stride()->data()[2] != 1 || view2->stride()->data()[2] != 1) { mMaxCacheSize = std::max(mMaxCacheSize, 2 * cmd->size()->data()[2] * bytes); } continue; } if (OpType_MatMul == op->type()) { bool transposeC = true; int e = cmd->size()->data()[0]; int l = cmd->size()->data()[1]; int h = cmd->size()->data()[2]; std::shared_ptr<Tensor> A, B, C, Bias; C.reset(Tensor::createDevice<float>({e, h})); if (op->main_as_MatMul()->transposeA()) { A.reset(Tensor::createDevice<float>({l, e})); } else { A.reset(Tensor::createDevice<float>({e, l})); } if (op->main_as_MatMul()->transposeB()) { B.reset(Tensor::createDevice<float>({h, l})); } else { B.reset(Tensor::createDevice<float>({l, h})); } auto view = cmd->view()->GetAs<View>(0); if (view->stride()->data()[0] == 1) { transposeC = false; } std::vector<Tensor*> inputs, outputs; if (cmd->indexes()->size() > 3) { Bias.reset(Tensor::createDevice<float>({h})); inputs = {A.get(), B.get(), Bias.get()}; } else { inputs = {A.get(), B.get()}; } outputs = {C.get()}; auto bufferPool = static_cast<CPUBackend*>(backend())->getBufferAllocator(); auto code = NO_ERROR; if (numberThread > 1) { bufferPool->barrierBegin(); } for (int v=0; v<numberThread; ++v) { if (numberThread > 1) { bufferPool->beginGroup(); } do { // If not loop parallel, parallel inside bool needParallel = numberThread == 1; mContainer[v].exe[i].reset(new CPUMatMul(backend(), op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), transposeC, needParallel)); if (nullptr == mContainer[v].exe[i]) { code = OUT_OF_MEMORY; break; } code = mContainer[v].exe[i]->onResize(inputs, outputs); } while (false); if (numberThread > 1) { bufferPool->endGroup(); } if (NO_ERROR != code) { break; } } if (numberThread > 1) { 
bufferPool->barrierEnd(); } if (NO_ERROR != code) { return code; } continue; } } auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber(); if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) { mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize)); if (mCacheBuffer.invalid()) { return OUT_OF_MEMORY; } mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize; static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer); } return NO_ERROR; } }
1.1.1.1.1.5 Backend::onResizeEnd
The call to Backend::onResizeEnd inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResizeEnd is a virtual function; since backupBackend is a CPUBackend (which inherits from Backend), the call resolves to CPUBackend::onResizeEnd:
// source/backend/cpu/CPUBackend.cpp
ErrorCode CPUBackend::onResizeEnd() {
    getCache()->release();
    return mDynamicAllocator->compute();
}
1.1.1.1.1.6 Execution::onExecute
The call to Execution::onExecute inside GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onExecute is a virtual function. How exe is created is described under CPUBackend::onCreate; the exe->onExecute call is polymorphic, with Execution as the base class. Again we use CPULoop for analysis:
// source/backend/cpu/CPURaster.cpp class CPULoop : public Execution { virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override { auto cpubackend = static_cast<CPUBackend*>(backend()); auto precision = cpubackend->precisionMode(); auto threadNumber = cpubackend->threadNumber(); if (mLoop->initCommand() != nullptr) { for (int i=0; i<mLoop->initCommand()->size(); ++i) { auto cmd = mLoop->initCommand()->GetAs<RegionCommand>(i); if (cmd->op() == nullptr) { auto output = mStack[cmd->indexes()->data()[0]]; ::memset(output->host<void>(), 0, cpubackend->getTensorSize(output) * cpubackend->functions()->bytes); } else { Tensor::InsideDescribe::Region reg; auto srcView = cmd->view()->GetAs<View>(1); auto dstView = cmd->view()->GetAs<View>(0); ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t)); auto input = mStack[cmd->indexes()->data()[1]]; auto inputSize = input->elementSize(); auto output = mStack[cmd->indexes()->data()[0]]; auto bytes = input->getType().bytes(); if (halide_type_float == input->getType().code) { bytes = cpubackend->functions()->bytes; } _blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>()); } } } if (1 == mLoop->commands()->size()) { auto cmd = mLoop->commands()->GetAs<RegionCommand>(0); auto op = cmd->op(); if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) { // For Gather / Single Unary auto index0 = cmd->iterIndexes()->data()[0]; auto index1 = cmd->iterIndexes()->data()[1]; int32_t iter = 0; int32_t* iter0 = &iter; int32_t* iter1 = &iter; int32_t iter0Stride = 0; int32_t iter1Stride = 0; if (index0 >= 0) { iter0 = originInputs[index0]->host<int32_t>(); iter0Stride = 1; } if (index1 >= 0) { iter1 = originInputs[index1]->host<int32_t>(); iter1Stride = 1; } Tensor::InsideDescribe::Region reg; auto srcView = cmd->view()->GetAs<View>(1); auto dstView = cmd->view()->GetAs<View>(0); ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t)); auto input = mStack[cmd->indexes()->data()[1]]; auto inputSize = input->elementSize(); auto output = mStack[cmd->indexes()->data()[0]]; auto bytes = input->getType().bytes(); if (halide_type_float == input->getType().code) { bytes = static_cast<CPUBackend*>(backend())->functions()->bytes; } auto step0 = cmd->steps()->data()[0]; auto step1 = cmd->steps()->data()[1]; auto loopNumber = mLoop->loopNumber(); for (; iter<loopNumber; ++iter) { auto srcIter = *(iter1 + iter1Stride * iter); auto dstIter = *(iter0 + iter0Stride * iter); auto srcOffset = srcIter * step1 + srcView->offset(); auto dstOffset = dstIter * step0 + dstView->offset(); if (dstOffset >= 0) { if (srcOffset >= 0 && srcOffset < inputSize) { _blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset); } else { _zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset); } } } return NO_ERROR; } } auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes; auto func = [&](int iter, int tId) { int fuseOutputStride[3]; const int32_t* outputStride = nullptr; auto fuseBuffer = mFuseBuffer + mMaxFuseBufferSize * tId; for (int index=0; index<mLoop->commands()->size(); ++index) { auto cmd = 
mLoop->commands()->GetAs<RegionCommand>(index); auto blit = _selectUnitProc(bytes, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1); auto op = cmd->op(); int iterIndexsize = cmd->iterIndexes()->size(); if (cmd->fuse() >= 0) { outputStride = fuseOutputStride; auto cmdSize = cmd->size()->data(); fuseOutputStride[0] = cmdSize[1] * cmdSize[2]; fuseOutputStride[1] = cmdSize[2]; fuseOutputStride[2] = 1; } else { // Loop Op's command's first index must be output outputStride = cmd->view()->GetAs<View>(0)->stride()->data(); } halide_type_t inputType; for (int v=0; v<iterIndexsize; ++v) { auto tensorIndex = cmd->indexes()->data()[v]; auto tensor = mStack[tensorIndex]; auto iterIndex = cmd->iterIndexes()->data()[v]; auto offset = iter; if (1 == v) { inputType = tensor->getType(); } if (iterIndex >= 0) { offset = mStack[iterIndex]->host<int32_t>()[iter]; } auto view = cmd->view()->GetAs<View>(v); offset = offset * cmd->steps()->data()[v] + view->offset(); mContainer[tId].stackPtr[tensorIndex] = tensor->host<uint8_t>() + offset * bytes; MNN_ASSERT(nullptr != tensor->host<uint8_t>()); } auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]]; auto dst = dstOrigin; if (cmd->fuse() >= 0) { dst = fuseBuffer.ptr(); } do { if (OpType_UnaryOp == op->type()) { auto src = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]]; if (nullptr == op->main()) { // Copy Tensor::InsideDescribe::Region reg; auto srcView = cmd->view()->GetAs<View>(1); auto dstView = cmd->view()->GetAs<View>(0); ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t)); ::memcpy(reg.dst.stride, outputStride, 3 * sizeof(int32_t)); auto step0 = cmd->steps()->data()[0]; auto step1 = cmd->steps()->data()[1]; auto loopNumber = mLoop->loopNumber(); _blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst); break; } auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode()); auto lastS = cmd->size()->data()[2]; if (lastS == 1 || cmd->view()->GetAs<View>(1)->stride()->data()[2] == 1) { for (int z=0; z<cmd->size()->data()[0]; ++z) { auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes; auto dstZ = dst + z * outputStride[0] * bytes; for (int y=0; y<cmd->size()->data()[1]; ++y) { auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes; auto dstY = dstZ + y * outputStride[1] * bytes; proc(dstY, srcY, lastS); } } } else { // Blit to cache auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId; for (int z=0; z<cmd->size()->data()[0]; ++z) { auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes; auto dstZ = dst + z * outputStride[0] * bytes; for (int y=0; y<cmd->size()->data()[1]; ++y) { auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes; auto dstY = dstZ + y * outputStride[1] * bytes; blit(srcCache, srcY, lastS, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1); proc(dstY, srcCache, lastS); } } } continue; } if (OpType_MatMul == op->type()) { // TODO: Don't support fuse for matmul currently const float* APtr = nullptr; const float* BPtr = nullptr; const float* BiasPtr = nullptr; float* CPtr = (float*)dst; auto exe = static_cast<CPUMatMul*>(mContainer[tId].exe[index].get()); APtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]]; BPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[2]]; 
if (iterIndexsize > 3) { BiasPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[3]]; } exe->execute(APtr, BPtr, CPtr, BiasPtr); break; } if (OpType_BinaryOp == op->type()) { auto src0 = mContainer[tId].stackPtr[cmd->indexes()->data()[1]]; MNNBinaryExecute proc; if (inputType.code == halide_type_float) { proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(op->main_as_BinaryOp()->opType()); } else { MNN_ASSERT(inputType.code == halide_type_int); proc = CPUBinary::selectForInt(op->main_as_BinaryOp()->opType()); } auto lastS = cmd->size()->data()[2]; auto stride0 = outputStride; auto stride1 = cmd->view()->GetAs<View>(1)->stride()->data(); MNN_ASSERT(stride0[2] == 1); auto src1 = mContainer[tId].stackPtr[cmd->indexes()->data()[2]]; auto stride2 = cmd->view()->GetAs<View>(2)->stride()->data(); auto blit1 = _selectUnitProc(bytes, stride1[2], 1); auto blit2 = _selectUnitProc(bytes, stride2[2], 1); if (cmd->size()->data()[2] == 1 || (stride1[2] == 1 && stride2[2] == 1)) { for (int z=0; z<cmd->size()->data()[0]; ++z) { auto src0Z = src0 + z * stride1[0] * bytes; auto src1Z = src1 + z * stride2[0] * bytes; auto dstZ = dst + z * stride0[0] * bytes; for (int y=0; y<cmd->size()->data()[1]; ++y) { auto src0Y = src0Z + y * stride1[1] * bytes; auto src1Y = src1Z + y * stride2[1] * bytes; auto dstY = dstZ + y * stride0[1] * bytes; proc(dstY, src0Y, src1Y, cmd->size()->data()[2], -1); } } } else { auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId; auto cache1 = cache0 + cmd->size()->data()[2] * bytes; for (int z=0; z<cmd->size()->data()[0]; ++z) { auto src0Z = src0 + z * stride1[0] * bytes; auto src1Z = src1 + z * stride2[0] * bytes; auto dstZ = dst + z * stride0[0] * bytes; for (int y=0; y<cmd->size()->data()[1]; ++y) { auto src0Y = src0Z + y * stride1[1] * bytes; auto src1Y = src1Z + y * stride2[1] * bytes; auto dstY = dstZ + y * stride0[1] * bytes; blit1(cache0, src0Y, cmd->size()->data()[2], stride1[2], 1); blit2(cache1, src1Y, cmd->size()->data()[2], stride2[2], 1); proc(dstY, cache0, cache1, cmd->size()->data()[2], -1); } } } break; } } while(false); if (dst != dstOrigin) { MNN_ASSERT(bytes == 4); // Currently only support add and float32 auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data(); auto srcF = (const float*)dst; auto dstF = (float*)dstOrigin; int sizeZ = cmd->size()->data()[0]; int sizeY = cmd->size()->data()[1]; int sizeX = cmd->size()->data()[2]; if (cmd->op()->type() == OpType_MatMul) { auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(cmd->fuse()); proc(dstF, dstF, srcF, sizeZ * sizeX, -1); continue; } switch (cmd->fuse()) { case BinaryOpOperation_ADD: for (int z=0; z<sizeZ; ++z) { auto srcZ = srcF + z * outputStride[0]; auto dstZ = dstF + z * dstStride[0]; for (int y=0; y<sizeY; ++y) { auto srcY = srcZ + y * outputStride[1]; auto dstY = dstZ + y * dstStride[1]; for (int x=0; x<sizeX; ++x) { auto dstOffset = x * dstStride[2]; dstY[dstOffset] = dstY[dstOffset] + srcY[x]; } } } break; case BinaryOpOperation_MUL: for (int z=0; z<sizeZ; ++z) { auto srcZ = srcF + z * dstStride[0]; auto dstZ = dstF + z * outputStride[0]; for (int y=0; y<sizeY; ++y) { auto srcY = srcZ + z * dstStride[1]; auto dstY = dstZ + z * outputStride[1]; for (int x=0; x<sizeX; ++x) { auto dstOffset = x * dstStride[2]; dstY[dstOffset] = dstY[dstOffset] * srcY[x]; } } } break; case BinaryOpOperation_SUB: for (int z=0; z<sizeZ; ++z) { auto srcZ = srcF + z * dstStride[0]; auto dstZ = dstF + z * outputStride[0]; for 
(int y=0; y<sizeY; ++y) { auto srcY = srcZ + z * dstStride[1]; auto dstY = dstZ + z * outputStride[1]; for (int x=0; x<sizeX; ++x) { auto dstOffset = x * dstStride[2]; auto D = dstY[dstOffset]; auto S = srcY[x]; dstY[dstOffset] = D - S; } } } break; default: break; } } } }; if (mLoop->parallel()) { MNN_CONCURRENCY_BEGIN(tId, threadNumber) { for (int iter=tId; iter < mLoop->loopNumber(); iter+=threadNumber) { func(iter, tId); } } MNN_CONCURRENCY_END(); } else { for (int iter=0; iter < mLoop->loopNumber(); ++iter) { func(iter, 0); } } return NO_ERROR; } }