1. createSession
Creates a session from the given ScheduleConfig and RuntimeInfo.
```
// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
    return createMultiPathSession({config}, runtime);
}
```
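For context, a usage sketch of the public API that reaches this function (assumed typical client code, not part of Interpreter.cpp; the ScheduleConfig field names are taken from the public headers):

```
// Hedged usage sketch: schedule a session on Vulkan, with CPU as the backup backend.
#include <MNN/Interpreter.hpp>

void buildSession(MNN::Interpreter* net /* assumed created via Interpreter::createFromFile */) {
    MNN::ScheduleConfig config;
    config.type       = MNN_FORWARD_VULKAN;  // primary backend
    config.backupType = MNN_FORWARD_CPU;     // backend used when an op is unsupported
    MNN::Session* session = net->createSession(config);
    // ... fill inputs, then net->runSession(session) ...
}
```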
1.1 createMultiPathSession
```
// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
    // ...
    auto result = newSession.get();
    auto validForResize = info.validForResize;
    if (validForResize && mNet->modes.inputMode == Session_Input_Inside
        && mNet->modes.resizeMode == Session_Resize_Direct) {
        result->resize();
    }
    // ...
    return result;
}
```
1.1.1 Session::resize
```
// source/core/Session.cpp
ErrorCode Session::resize() {
    // ...
    if (mNeedMalloc) {
        // Set needResize = true for easy for judge in runSession when error
        mNeedResize = true;
        // Turn Pipeline to Command Buffer and Malloc resource
        // TODO: Separate Schedule and Malloc
        bool forbidReplace = permitCodegen;
        if (mInfo.constReplaceBackend != nullptr) {
            forbidReplace = true;
        }
        for (auto& iter : mPipelines) {
            auto error = iter->allocMemory(firstMalloc, forbidReplace);
            if (NO_ERROR != error) {
                return error;
            }
        }
        // ...
        mNeedMalloc = false;
        mNeedResize = false;
    }
    // ...
    return NO_ERROR;
}
```
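For reference, a hedged sketch of how Session::resize is typically reached from the public Interpreter API (the createSession path shown above, plus the explicit resize path; the input shape is illustrative):

```
// Hedged sketch: the two common triggers for Session::resize.
// 1) createSession() itself calls it when inputMode == Session_Input_Inside and
//    resizeMode == Session_Resize_Direct (see createMultiPathSession above).
// 2) After changing an input shape, the client requests an explicit resize:
auto input = net->getSessionInput(session, nullptr);   // default input tensor
net->resizeTensor(input, {1, 3, 224, 224});            // new shape (example values)
net->resizeSession(session);                           // ends up in Session::resize()
```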
1.1.1.1 Pipeline::allocMemory
The data structures involved are OpCacheInfo, BackendCache, Command, and CommandBuffer:
```
// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
// struct BackendCache {
//     Backend::Info info;
//     BackendConfig config;
//     std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
//     bool needComputeShape = true;
//     bool needComputeGeometry = true;
//     bool reportError = true;
//     std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
// };
//
// /** pipeline info */
// struct OpCacheInfo {
//     /** op */
//     const Op* op;
//     /** input tensors */
//     std::vector<Tensor*> inputs;
//     /** output tensors */
//     std::vector<Tensor*> outputs;
//     /** schedule type*/
//     Schedule::Type type = Schedule::Type::SEPARATE;
//
//     /**Command buffer for cache*/
//     CommandBuffer cacheBuffer;
//
//     /**Command buffer for execute*/
//     CommandBuffer executeBuffer;
//
//     std::map<const Op*, std::shared_ptr<Execution>> executionCache;
// };
//
// struct Command : public RefCount {
//     const Op* op;
//     std::vector<Tensor*> workInputs;
//     std::vector<Tensor*> workOutputs;
//     std::vector<Tensor*> inputs;
//     std::vector<Tensor*> outputs;
//     std::shared_ptr<BufferStorage> buffer;
//     std::shared_ptr<Execution> execution;
//     std::shared_ptr<OperatorInfo> info;
// #ifdef MNN_BUILD_CODEGEN
//     bool canVectorize = false;
// #endif
// };
//
// struct CommandBuffer {
//     std::vector<SharedPtr<Command>> command;
//     std::vector<std::shared_ptr<Tensor>> extras;
//     bool hasWrap = false;
// };

ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) {
    // MNN_PRINT("allocMemory mtype:%d, cpubackendType:%d, cpuBackend runtime:%p\n", mBackend->type(), mBackupBackend->type(), mBackupBackend->getRuntime());
    if (!firstMalloc) {
        // For session setNeedMalloc, if session's output is set as some input, It may cause error
        // Dup des to avoid it
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (const auto& infoP : buffer.command) {
                auto& info = *infoP;
                for (auto t : info.workOutputs) {
                    if (!TensorUtils::getDescribe(t)->isMutable) {
                        continue;
                    }
                    auto des = TensorUtils::getDescribe(t);
                    auto usage = des->usage;
                    if (TensorUtils::getDescribeOrigin(t)->mContent->count() > 1) {
                        TensorUtils::getDescribeOrigin(t)->mContent = new Tensor::InsideDescribe::NativeInsideDescribe;
                        auto dstDes = TensorUtils::getDescribe(t);
                        t->buffer().dim = dstDes->dims;
                        ::memcpy(t->buffer().dim, des->dims, MNN_MAX_TENSOR_DIM * sizeof(halide_dimension_t));
                        dstDes->dimensionFormat = des->dimensionFormat;
                        dstDes->usage = usage;
                        dstDes->regions = des->regions;
                        dstDes->quantAttr = des->quantAttr;
                        dstDes->tensorArrayAttr = des->tensorArrayAttr;
                    }
                }
            }
        }
    }
    // mInfo : PipelineInfo, i.e. std::pair<BackendCache, std::vector<OpCacheInfo>>
    // Start creating the Executions
    /* Create Execution Begin */
    // mInfo.first              : BackendCache
    // mInfo.first.cache.first  : std::shared_ptr<Backend>, the primary backend (here the explicitly created VulkanBackend)
    auto& mBackend = mInfo.first.cache.first;
    // mInfo.first.cache.second : std::shared_ptr<Backend>, the backup CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    mBackend->onClearBuffer();
    mBackupBackend->onClearBuffer();
    // Check If we need a lone time for init
    if (mBackend->type() != MNN_FORWARD_CPU && mBackend->type() != MNN_FORWARD_CPU_EXTENSION && mTuneAttr.autoSetOpType) {
        Runtime::OpInfo dstInfo;
        int currentInitCount = 0;
        std::vector<Schedule::OpCacheInfo> initInfos;
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (auto& iterP : buffer.command) {
                auto& iter = *iterP;
                dstInfo.initCostLong = false;
                mRuntime->onMeasure(iter.inputs, iter.outputs, iter.op, dstInfo);
                if (dstInfo.initCostLong) {
                    initInfos.emplace_back(info);
                    currentInitCount++;
                    break;
                }
            }
            if (currentInitCount >= mTuneAttr.maxTuningNumber) {
                break;
            }
        }
        if (currentInitCount > 0) {
            MNN_PRINT("Turn back to cpu\n");
            // Reset execution
            for (auto& info : mInfo.second) {
                info.executionCache.clear();
                for (auto& iterP : info.executeBuffer.command) {
                    iterP->execution = nullptr;
                    iterP->execution = nullptr;
                    _recycleDynamicMemory(iterP.get());
                }
            }
            if (!mRuntime->hasAsyncWork()) {
                _pushTuningTask(std::move(initInfos));
            }
            mBackend.reset(mCpuRuntime->onCreate(nullptr));
        }
    }
    {
        // Create the executions
        auto code = _createExecutions(mInfo);
        if (NO_ERROR != code) {
            return code;
        }
    }
    /* Create Execution End */
    _SetTensorBackend(mInfo, mAllocInput);
    // Insert Wrap If needed
    {
        auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mShapeFixConstCache, mAllocInput, forbidReplace);
        if (NO_ERROR != insertCode) {
            return insertCode;
        }
    }
    /* Insert Wrap End*/
    // Compute RefCount Begin
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        // MNN_PRINT("before resize, mInfo.second size:%lu, command size:%lu,op type:%s, op name:%s\n", mInfo.second.size(), buffer.command.size(), EnumNameOpType(info.op->type()), info.op->name()->c_str());
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            for (auto t : iter.workInputs) {
                auto des = TensorUtils::getDescribe(t);
                if (des->usage != Tensor::InsideDescribe::CONSTANT) {
                    des->useCount = 0;
                }
            }
        }
    }
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            for (auto t : iter.workInputs) {
                auto des = TensorUtils::getDescribe(t);
                if (des->usage != Tensor::InsideDescribe::CONSTANT) {
                    des->useCount += 1;
                }
            }
        }
    }
    // Compute RefCount End
    // Alloc tensor
    mBackend->onResizeBegin();
    mBackupBackend->onResizeBegin();
    // mInfo.first  : BackendCache
    // mInfo.second : std::vector<OpCacheInfo>
    for (auto& info : mInfo.second) {
        // info.executeBuffer : CommandBuffer
        auto& buffer = info.executeBuffer;
        // buffer.command : std::vector<SharedPtr<Command>>
        for (int cmdIndex = 0; cmdIndex < buffer.command.size(); ++cmdIndex) {
            auto& iterP = buffer.command[cmdIndex];
            auto& iter = *iterP;
#ifdef MNN_PIPELINE_DEBUG
            auto memory = const_cast<Runtime*>(mRuntime)->onGetMemoryInMB();
            if (nullptr != info.op->name()) {
                MNN_PRINT("%f, before Resize: %s - %d\n", memory, info.op->name()->c_str(), cmdIndex);
            }
#endif
            // MNN_PRINT("before Resize: optype:%s, name:%s, input0:%p, output0:%p, mAllocInput:%d\n", EnumNameOpType(iter.op->type()), iter.info->name().c_str(), iter.inputs[0], iter.outputs[0], mAllocInput);
            // Alloc for Tensors
            // iter           : Command
            // iter.execution : std::shared_ptr<Execution>
            auto curBackend = iter.execution->backend();
            if (mAllocInput) {
                for (auto t : iter.workInputs) {
                    auto allocRes = _allocTensor(t, curBackend, mOutputStatic);
                    if (!allocRes) {
                        return OUT_OF_MEMORY;
                    }
                }
            }
            {
                for (auto t : iter.workOutputs) {
                    auto res = _allocTensor(t, curBackend, mOutputStatic);
                    if (!res) {
                        return OUT_OF_MEMORY;
                    }
                }
            }
#ifdef MNN_PIPELINE_DEBUG
            if (iter.info != nullptr) {
                MNN_PRINT("before Resize 2, calling: %s - %d \n", iter.info->name().c_str(), cmdIndex);
            }
#endif
            // iter.execution : std::shared_ptr<Execution>
            auto code = iter.execution->onResize(iter.workInputs, iter.workOutputs);
            if (NO_ERROR != code) {
#ifdef MNN_PIPELINE_DEBUG
                MNN_ERROR("Pipeline Resize error: %d\n", code);
#endif
                if (iter.info.get()) {
                    MNN_ERROR("Resize error for type = %s, name = %s \n", iter.info->type().c_str(), iter.info->name().c_str());
                }
                return code;
            }
            // Free mid tensor
            for (auto t : iter.workInputs) {
                _releaseTensor(t, mAllocInput);
            }
        }
    }
    // Recycle All Dynamic Tensor
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        for (auto& c : buffer.command) {
            _recycleDynamicMemory(c.get());
        }
    }
    auto code = mBackend->onResizeEnd();
    if (code != NO_ERROR) {
        return code;
    }
    code = mBackupBackend->onResizeEnd();
    return code;
}
```
1.1.1.1.1 _createExecutions
The data structures OpCacheInfo, BackendCache, Command, and CommandBuffer are the same as defined in section 1.1.1.1 above.
```
// source/core/Pipeline.cpp
static ErrorCode _createExecutions(Schedule::PipelineInfo& mInfo) {
    // mInfo.first              : BackendCache
    // mInfo.first.cache.first  : std::shared_ptr<Backend>, the primary backend (here the explicitly created VulkanBackend)
    auto& mBackend = mInfo.first.cache.first;
    // mInfo.first.cache.second : std::shared_ptr<Backend>, the backup CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    // mInfo.second : std::vector<OpCacheInfo>
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        // MNN_PRINT("before resize, mInfo.second size:%lu, command size:%lu,op type:%s, op name:%s\n", mInfo.second.size(), buffer.command.size(), EnumNameOpType(info.op->type()), info.op->name()->c_str());
        // buffer         : CommandBuffer
        // buffer.command : std::vector<SharedPtr<Command>>
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            // Create exe
            // Find Cache
            // Look in the cache first; create the execution only on a miss
            bool cached = false;
            if (nullptr == iter.execution) {
                /** Cache origin execution for fast resize*/
                auto exeIter = info.executionCache.find(iter.op);
                if (exeIter != info.executionCache.end()) {
                    iter.execution = exeIter->second;
                    cached = true;
                }
            }
            if (nullptr == iter.execution) {
                // Try the primary backend first (here the VulkanBackend)
                // iter           : Command
                // iter.execution : std::shared_ptr<Execution>
                iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));
            }
            if (nullptr == iter.execution) {
                // Try Backup
                // Fall back to the backup backend (here the CPUBackend) if creation failed
                iter.execution.reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op));
                if (nullptr == iter.execution) {
                    if (mInfo.first.reportError) {
                        MNN_ERROR("Create execution error : %d\n", iter.op->type());
                    }
                    return NOT_SUPPORT;
                }
            }
            // invalid means memory alloc failed
            if (!iter.execution->valid()) {
                iter.execution = nullptr;
                iter.execution = nullptr;
                return OUT_OF_MEMORY;
            }
            if ((!cached) && iter.buffer == nullptr && (iter.op->type() != OpType_Raster) && (iter.op->type() != OpType_BinaryOp)) {
                // info                : OpCacheInfo
                // info.executionCache : std::map<const Op*, std::shared_ptr<Execution>>
                info.executionCache.insert(std::make_pair(iter.op, iter.execution));
            }
        }
    }
    return NO_ERROR;
}
```
1.1.1.1.1.1 VulkanBackend::onCreate
The code in _createExecutions that calls VulkanBackend::onCreate is:
```
iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));
```
Since mBackend is a VulkanBackend (which derives from Backend), the call resolves to VulkanBackend::onCreate, implemented as follows:
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
Execution* VulkanBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                   const MNN::Op* op) {
    auto creator = getCreatorMap();
    auto iter = creator->find(op->type());
    std::string name = "";
    if (nullptr != op->name()) {
        name = op->name()->str();
    }
    if (iter == creator->end()) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_PRINT("Vulkan don't support %d, %s: %s\n", op->type(), EnumNameOpType(op->type()), name.c_str());
#endif
        return nullptr;
    }
    bool valid = true;
    for (int i = 0; i < inputs.size(); ++i) {
        if (!OpCommonUtils::opNeedContent(op, i)) {
            continue;
        }
        auto t = inputs[i];
        auto inputDes = TensorUtils::getDescribe(t);
        if (inputDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) {
            for (auto& r : inputDes->regions) {
                if (!_supportImageSize(r.origin)) {
                    valid = false;
                    break;
                }
            }
            if (!valid) {
                break;
            }
        } else {
            if (!_supportImageSize(t)) {
                valid = false;
                break;
            }
        }
    }
    for (auto t : outputs) {
        if (!_supportImageSize(t)) {
            valid = false;
            break;
        }
    }
    if (!valid) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_ERROR("Vulkan don't support for %s, type=%s, Tensor not support\n", name.c_str(), EnumNameOpType(op->type()));
#endif
        return nullptr;
    }
    // iter->second is a MNN::VulkanBackend::Creator*, i.e. the factory that builds the concrete op execution
    auto originExecution = (VulkanBasicExecution*)iter->second->onCreate(inputs, outputs, op, this);
    if (nullptr == originExecution) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_ERROR("Vulkan don't support for %s, type=%s, Special case\n", name.c_str(), EnumNameOpType(op->type()));
#endif
        return nullptr;
    }
    if (mDirect) {
        return new VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution>(originExecution));
    }
    return new VulkanBasicExecutionInDirect(std::shared_ptr<VulkanBasicExecution>(originExecution));
}
```
1.1.1.1.1.1.1 VulkanBackend::Creator::onCreate
The code in VulkanBackend::onCreate that calls VulkanBackend::Creator::onCreate is:
```
auto creator = getCreatorMap();
auto iter = creator->find(op->type());
// iter->second is a MNN::VulkanBackend::Creator*, i.e. the factory that builds the concrete op execution
auto originExecution = (VulkanBasicExecution*)iter->second->onCreate(inputs, outputs, op, this);
```
Note: iter->second->onCreate is a polymorphic call; at runtime it dispatches to a different subclass depending on the op type (opType). The base class is VulkanBackend::Creator.
One concrete implementation is VulkanConvolutionCreator:
```
// source/backend/vulkan/image/execution/VulkanConvolution.cpp
class VulkanConvolutionCreator : public VulkanBackend::Creator {
public:
    virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                           const MNN::Op* op, Backend* backend) const override {
        auto extra = static_cast<VulkanBackend *>(backend);
        auto convReal = op->main_as_Convolution2D();
        auto common = convReal->common();
        auto outputCount = common->outputCount();
        const int fh = common->kernelY();
        const int fw = common->kernelX();
        int srcCount = 0;
        const float* source = nullptr;
        const float* biasPtr = nullptr;
        int weightSize = 0;
        std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();
            if (1 == quan->type() || 2 == quan->type()) {
                if (quan->has_scaleInt()) {
                    // Don't support IDST-int8 because of error
                    return nullptr;
                }
            }
            quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
            srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw);
            source = quanWeight->weightFloat.get();
            weightSize = quanWeight->weightFloat.size();
        } else {
            if (nullptr != convReal->weight()) {
                srcCount = convReal->weight()->size() / (outputCount * fh * fw);
                source = convReal->weight()->data();
                weightSize = convReal->weight()->size();
            } else {
                srcCount = convReal->common()->inputCount();
            }
        }
        if (nullptr != convReal->bias()) {
            biasPtr = convReal->bias()->data();
        }
        if (op->type() == OpType_Convolution) {
            if (inputs.size() > 1) {
                return nullptr;
            }
            auto convCommonParam = op->main_as_Convolution2D()->common();
            const int group = convCommonParam->group();
            if (1 == group) {
                return VulkanConvolutionImpl::create(extra, common, inputs, outputs[0], source, biasPtr, srcCount, outputCount);
            } else {
                return nullptr;
            }
        }
        return new VulkanConvolutionDepthwise(source, weightSize, op, backend);
    }
};

static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_Convolution, new VulkanConvolutionCreator);
    VulkanBackend::addCreator(OpType_ConvolutionDepthwise, new VulkanConvolutionCreator);
    return true;
}();
```
1.1.1.1.1.1.2 VulkanBasicExecution
VulkanBasicExecution is the base class of all Vulkan op executions:
```
// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecution {
public:
    VulkanBasicExecution(Backend *bn) : mBackend(bn) {
        // Do nothing
    }
    virtual ~VulkanBasicExecution() = default;

    virtual ErrorCode onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                               const VulkanCommandPool::Buffer *cmdBuffer) = 0;

    Backend* backend() {
        return mBackend;
    }

private:
    Backend* mBackend;
};
```
1.1.1.1.1.1.3 Registration of Vulkan op execution creators
VulkanBackend::onCreate looks executions up through the file-scope map gCreator, which caches every VulkanBackend::Creator instance, i.e. the factories that create Vulkan op executions. Registration into this map is spread across the individual Creator implementation files and is performed via static initialization:
```
// source/backend/vulkan/image/execution/VulkanConvolution.cpp
static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_Convolution, new VulkanConvolutionCreator);
    VulkanBackend::addCreator(OpType_ConvolutionDepthwise, new VulkanConvolutionCreator);
    return true;
}();
```
VulkanBackend::addCreator is implemented as follows:
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
bool VulkanBackend::addCreator(OpType t, Creator* c) {
    auto allKind = getCreatorMap();
    allKind->insert(std::make_pair(t, c));
    return true;
}

static std::map<OpType, VulkanBackend::Creator*>* gCreator = nullptr;

// Creator
static inline std::map<OpType, VulkanBackend::Creator*>* getCreatorMap() {
    if (nullptr == gCreator) {
        gCreator = new std::map<OpType, VulkanBackend::Creator*>();
    }
    return gCreator;
}
```
Adding a Vulkan implementation
- Add the shader
  Add the shader source (*.comp) under source/backend/vulkan/execution/glsl. If the input memory layout is NC4HW4, implement it with image; otherwise use buffer. Existing implementations in that directory can serve as references. Then run the makeshader.py script to compile the shaders.
- Declare the execution class
  Add VulkanMyCustomOp.hpp and VulkanMyCustomOp.cpp under source/backend/vulkan/execution/:
```
class VulkanMyCustomOp : public VulkanBasicExecution {
public:
    VulkanMyCustomOp(const Op* op, Backend* bn);
    virtual ~VulkanMyCustomOp();
    ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                       const VulkanCommandPool::Buffer* cmdBuffer) override;

private:
    // Parameters required by the GPU shader
    std::shared_ptr<VulkanBuffer> mConstBuffer;
    // Pipeline
    const VulkanPipeline* mPipeline;
    // Layout Descriptor Set
    std::shared_ptr<VulkanPipeline::DescriptorSet> mDescriptorSet;
};
```
- Implement
  Implement onEncode. Start with a memory-layout check: if the layout is NC4HW4, use the image-based shader, otherwise the buffer-based one. Return NO_ERROR on success (a minimal sketch follows the registration code below).
- Register the execution class
```
class VulkanMyCustomOpCreator : public VulkanBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const MNN::Op* op,
                                Backend* backend) const override {
        return new VulkanMyCustomOp(op, backend);
    }
};

static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_MyCustomOp, new VulkanMyCustomOpCreator);
    return true;
}();
```
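Below is a minimal, hypothetical sketch of the layout check described in the implementation step above. VulkanMyCustomOp is the example class declared earlier; the body only marks where the image/buffer branches and the dispatch recording would go, it is not MNN source:

```
// Hypothetical sketch, not MNN source: layout check at the top of onEncode.
ErrorCode VulkanMyCustomOp::onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                     const VulkanCommandPool::Buffer* cmdBuffer) {
    auto format = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
    if (format == MNN_DATA_FORMAT_NC4HW4) {
        // image-backed path: bind the image views of inputs/outputs to mDescriptorSet
    } else {
        // buffer-backed path
    }
    // ... write uniforms into mConstBuffer, bind mPipeline, record a vkCmdDispatch into cmdBuffer ...
    return NO_ERROR;
}
```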
Adding a Metal implementation
- Add the shader
  Add MetalMyCustomOp.metal under source/backend/Metal and add it to the Xcode project. Existing .metal files in that directory can serve as references.
- Declare the execution class
  Add MetalMyCustomOp.hpp and MetalMyCustomOp.cpp under source/backend/Metal and add them to the Xcode project:
```
class MetalMyCustomOp : public Execution {
public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override;
};
```
- Implement
  Implement onResize and onExecute. Unlike CPU tensors, which keep their data behind the host pointer, Metal keeps its data pointer in deviceId, and what deviceId stores is an id<MTLBuffer>:
```
auto buffer = (__bridge id<MTLBuffer>)(void *)tensor->deviceId();
```
Op-specific parameters for a Metal op can likewise be stored in an id<MTLBuffer>. The buffer's data type may differ from the tensor's, and a single buffer may even mix several data types, as long as the correct length is specified at creation time. For example:
```
auto buffer = [context newDeviceBuffer:2 * sizeof(int) + 2 * sizeof(__fp16) access:CPUWriteOnly];
((__fp16 *)buffer.contents)[0] = mAlpha / mLocalSize;   // alpha
((__fp16 *)buffer.contents)[1] = mBeta;                 // beta
((int *)buffer.contents)[1]    = mLocalSize;            // local size
((int *)buffer.contents)[2]    = inputs[0]->channel();  // channel
```
When creating a buffer you must specify its access policy. There are currently three:
- CPUReadWrite: the data is shared between CPU and GPU; typically used for device buffers.
- CPUWriteOnly: the data is written by the CPU and never read back by it; typically used for parameter buffers.
- CPUTransparent: the data lives only on the GPU; typically used for heap buffers.
MNNMetalContext offers two similar sets of buffer-creation interfaces that differ only in the data's lifetime:
- memory occupied by a device buffer is never reused within a single inference;
- memory occupied by a heap buffer can be reused by other ops after -[MNNMetalContext releaseHeapBuffer:] is called.

In general, heap is only used together with CPUTransparent. Heap is effective only on iOS 10+; on iOS 9 and below it falls back to device.
When using Metal, do not create your own device or library unless absolutely necessary: loading a library and compiling functions are time-consuming, and MNNMetalContext already applies the necessary caching. An example of running Metal through the context:
```
auto context   = (__bridge MNNMetalContext *)backend->context();
auto kernel    = /* metal kernel name NSString */;
auto encoder   = [context encoder];
auto bandwidth = [context load:kernel encoder:encoder];
/* encoder set buffer(s)/sampler(s) */
[context dispatchEncoder:encoder
                 threads:{x, y, z}
      maxThreadsPerGroup:maxThreadsPerThreadgroup]; // recommended way to dispatch
[encoder endEncoding];
```
- Register the execution class
```
class MetalMyCustomOpCreator : public MetalBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op,
                                Backend *backend) const {
        return new MetalMyCustomOp(backend);
    }
};
REGISTER_METAL_OP_CREATOR(MetalMyCustomOpCreator, OpType_MyCustomOp);
```
After adding the registration code, re-run CMake so that the registration file is updated automatically.
Adding an OpenCL implementation
- Add the kernel
  Add the kernel source (*.cl) under source/backend/opencl/execution/cl. Feature maps currently use image2d. Existing implementations in that directory can serve as references. Then run opencl_codegen.py to generate the kernel map.
- Declare the execution class
  Add MyCustomOp.h and MyCustomOp.cpp under source/backend/opencl/execution/:
```
template <typename T>
class MyCustomOp : public Execution {
public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override;
};
```
- Implement
  Implement onResize (optional) and onExecute, returning NO_ERROR on success.
- Register the execution class
```
OpenCLCreatorRegister<TypedCreator<MyCustomOp<cl_data_t>>> __my_custom_op(OpType_MyCustomOp);
```
Adding an OpenGL implementation
- Add the shader
  Add the shader source (*.glsl) under source/backend/opengl/glsl, without a file header; feature maps are represented with image3d. Existing implementations in that directory can serve as references. Then run makeshader.py in the source/backend/opengl directory.
- Add the executor
  Add GLMyCustomOp.h and GLMyCustomOp.cpp under source/backend/opengl/execution/:
```
class GLMyCustomOp : public Execution {
public:
    GLMyCustomOp(const std::vector<Tensor *> &inputs, const Op *op, Backend *bn);
    virtual ~GLMyCustomOp();
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
                                const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
                               const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<GLProgram> mProgram;
};
```
- Implement
  Implement onResize (optional) and onExecute, returning NO_ERROR on success.
- Register the execution class
```
GLCreatorRegister<TypedCreator<GLMyCustomOp>> __my_custom_op(OpType_MyCustomOp);
```
1.1.1.1.1.1.4 VulkanBasicExecutionDirect
```
// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecutionDirect : public Execution {
public:
    VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution> encoder);
    virtual ~VulkanBasicExecutionDirect() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<VulkanBasicExecution> mEncoder;
    std::shared_ptr<VulkanCommandPool::Buffer> mCmdBuffer;
};
```
1.1.1.1.1.1.5 VulkanBasicExecutionDirect::VulkanBasicExecutionDirect
```
// source/backend/vulkan/image/execution/VulkanBasicExecution.cpp
VulkanBasicExecutionDirect::VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution> encoder)
    : Execution(encoder->backend()) {
    mEncoder = encoder;
    auto extra = static_cast<VulkanBackend *>(encoder->backend());
    mCmdBuffer.reset(const_cast<VulkanCommandPool::Buffer *>(extra->getPool().allocBuffer()));
}
```
1.1.1.1.1.1.6 VulkanBasicExecutionInDirect
```
// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecutionInDirect : public Execution {
public:
    VulkanBasicExecutionInDirect(std::shared_ptr<VulkanBasicExecution> encoder);
    virtual ~VulkanBasicExecutionInDirect() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        return NO_ERROR;
    }
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<VulkanBasicExecution> mEncoder;
};
```
1.1.1.1.1.2 CPUBackend::onCreate
In _createExecutions, VulkanBackend::onCreate is tried first to build the op's execution; if that fails, the backup CPUBackend::onCreate is used instead.
```
if (nullptr == iter.execution) {
    // Try the primary backend first (here the VulkanBackend)
    // iter           : Command
    // iter.execution : std::shared_ptr<Execution>
    iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));
}
if (nullptr == iter.execution) {
    // Try Backup
    // Fall back to the backup backend (here the CPUBackend) if creation failed
    iter.execution.reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op));
    if (nullptr == iter.execution) {
        if (mInfo.first.reportError) {
            MNN_ERROR("Create execution error : %d\n", iter.op->type());
        }
        return NOT_SUPPORT;
    }
}
```
For the analysis of CPUBackend::onCreate, see the linked article.
1.1.1.1.2 Backend::onResizeBegin
The code in Pipeline::allocMemory that calls VulkanBackend::onResizeBegin and CPUBackend::onResizeBegin is:
```
mBackend->onResizeBegin();
mBackupBackend->onResizeBegin();
```
onResizeBegin is a virtual function; since mBackend is a VulkanBackend (which derives from Backend), the call resolves to VulkanBackend::onResizeBegin:
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
void VulkanBackend::onResizeBegin() {
    // Start recording mInitBuffer
    mInitBuffer->begin(0);
    if (!mDirect) {
        mCmdBuffer->begin(0);
    }
}
```
mBackupBackend is a CPUBackend (which derives from Backend); its implementation is analyzed in the linked article.
1.1.1.1.2.1 VulkanCommandPool::Buffer::begin
The code in VulkanBackend::onResizeBegin that calls VulkanCommandPool::Buffer::begin is:
```
mInitBuffer->begin(0);
```
It is implemented as follows:
```
void VulkanCommandPool::Buffer::begin(VkCommandBufferUsageFlags flag) const {
    VkCommandBufferBeginInfo cmdBufferBeginInfo{
        /* .sType            = */ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        /* .pNext            = */ nullptr,
        /* .flags            = */ flag,
        /* .pInheritanceInfo = */ nullptr,
    };
    vkResetCommandBuffer(mBuffer, 0);
    CALL_VK(vkBeginCommandBuffer(mBuffer, &cmdBufferBeginInfo));
}
```
1.1.1.1.3 _allocTensor
The code in Pipeline::allocMemory that calls _allocTensor is:
```
for (auto t : iter.workInputs) {
    auto allocRes = _allocTensor(t, curBackend, mOutputStatic);
    if (!allocRes) {
        return OUT_OF_MEMORY;
    }
}
```
It is implemented as follows:
```
// source/core/Pipeline.cpp
static bool _allocTensor(Tensor* t, Backend* curBackend, bool outputStatic) {
    auto memoryType = _getTensorStorageType(t, outputStatic);
    auto bn  = TensorUtils::getDescribe(t)->getBackend();
    auto des = TensorUtils::getDescribe(t);
    if (nullptr == des->mem.get()) {
        MNN_ASSERT(des->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL);
        TensorUtils::setLinearLayout(t);
        auto res = curBackend->onAcquireBuffer(t, memoryType);
        return res;
    }
    return true;
}
```
1.1.1.1.3.1 Backend::onAcquireBuffer
The code in _allocTensor that calls Backend::onAcquireBuffer is:
```
auto res = curBackend->onAcquireBuffer(t, memoryType);
```
onAcquireBuffer is defined only on the Backend base class; it allocates memory for a tensor:
```
bool Backend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) {
    auto mem = this->onAcquire(tensor, storageType);
    if (nullptr == mem) {
        return false;
    }
    if (mem == TensorUtils::getDescribe(tensor)->mem.get()) {
        return true;
    }
    TensorUtils::getDescribe(tensor)->mem.reset(mem);
    return true;
}
```
onAcquireBuffer calls onAcquire, a virtual function; since the curBackend passed in is a VulkanBackend (which derives from Backend), the call resolves to VulkanBackend::onAcquire:
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
Backend::MemObj* VulkanBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
    //FUNC_PRINT_ALL(tensor, p);
    auto MTensor = const_cast<Tensor*>(tensor);
    auto format  = _getFormat(tensor->getType());
    if (Backend::STATIC == storageType) {
        auto newBuffer = std::make_shared<VulkanTensor>(MTensor, format, getMemoryPool(), device().proty().limits);
        MTensor->buffer().device = (uint64_t)(newBuffer.get());
        return new VulkanMemRelease(newBuffer);
    }
    bool separate  = storageType == Backend::DYNAMIC_SEPERATE;
    auto newBuffer = std::make_shared<VulkanTensor>(MTensor, format, getDynamicMemoryPool(), device().proty().limits, separate);
    MTensor->buffer().device = (uint64_t)(newBuffer.get());
    mAllBuffers.insert(std::make_pair(MTensor->buffer().device, newBuffer));
    return new VulkanMemRelease(newBuffer);
}
```
1.1.1.1.4 Execution::onResize
The code in Pipeline::allocMemory that calls Execution::onResize is:
```
// iter           : Command
// iter.execution : std::shared_ptr<Execution>
auto code = iter.execution->onResize(iter.workInputs, iter.workOutputs);
```
onResize is a virtual function. iter.execution is created as described under VulkanBackend::onCreate, and iter.execution->onResize dispatches polymorphically through the Execution base class. The executions created by the Vulkan backend are subclasses of VulkanBasicExecution, wrapped in either VulkanBasicExecutionDirect or VulkanBasicExecutionInDirect. Taking VulkanBasicExecutionDirect as the example, its implementation is:
```
// source/backend/vulkan/image/execution/VulkanBasicExecution.cpp
ErrorCode VulkanBasicExecutionDirect::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    // Member variables in VulkanBackend:
    //   std::shared_ptr<VulkanCommandPool::Buffer> mCmdBuffer;
    //   std::shared_ptr<VulkanCommandPool::Buffer> mInitBuffer;
    //   mutable std::vector<VkCommandBuffer> mCmdBuffers;
    auto initCmdBuffer = static_cast<VulkanBackend*>(backend())->getInitCommandBuffer();
    _initLayout(inputs, outputs, initCmdBuffer);
    // Start recording commands
    mCmdBuffer->begin(0);
    // Record the op's commands
    auto code = mEncoder->onEncode(inputs, outputs, mCmdBuffer.get());
    for (auto output : outputs) {
        auto vkTensor = reinterpret_cast<VulkanTensor*>(output->deviceId());
        for (int i = 0; i < vkTensor->imageSize(); ++i) {
            auto img = vkTensor->image(i);
            img->barrierRead(mCmdBuffer->get());
        }
    }
    _postTreat(outputs, mCmdBuffer.get());
    // Finish recording
    mCmdBuffer->end();
#ifdef MNN_VULKAN_DEBUG
#ifdef MNN_VULKAN_DEBUG_EAGER
    static_cast<VulkanBackend*>(backend())->onExecuteBegin();
    static_cast<VulkanBackend*>(backend())->pushCommand(mCmdBuffer->get());
    static_cast<VulkanBackend*>(backend())->onExecuteEnd();
#endif
#endif
    return code;
}
```
In other words, resize time is when the commands that perform the op's computation are recorded, all behind the common Execution base-class interface.
1.1.1.1.4.1 VulkanBasicExecution::onEncode
The code in VulkanBasicExecutionDirect::onResize that calls VulkanBasicExecution::onEncode is:
```
auto code = mEncoder->onEncode(inputs, outputs, mCmdBuffer.get());
```
onEncode is a virtual function; mEncoder's base class is VulkanBasicExecution. Taking VulkanConvolutionIm2Col as the example, its implementation is:
```
// source/backend/vulkan/image/execution/VulkanConvolutionImpl.cpp
class VulkanConvolutionIm2Col : public VulkanBasicExecution {
    virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                               const VulkanCommandPool::Buffer* cmdBuffer) override {
        auto src = inputs[0];
        auto dst = outputs[0];
        const int icDiv4 = UP_DIV(src->channel(), 4);
        const int ocDiv4 = UP_DIV(dst->channel(), 4);
        auto vkBn = (VulkanBackend*)backend();
        int limit = vkBn->proty().limits.maxImageDimension2D * 4;
#ifdef VULKAN_IM2COL_GEMM_UNIT
        limit = VULKAN_IM2COL_GEMM_UNIT;
#endif
        if (limit < dst->width()) {
            MNN_ERROR("Don't support width too large feature: %d x %d, limit = %d\n", dst->width(), dst->height(), limit);
            return NOT_SUPPORT;
        }
        int batchLoopNumber  = 1;
        int heightLoopNumber = 1;
        int unitHeight = dst->height();
        int unitBatch  = dst->batch();
        auto area = dst->width() * dst->height();
        if (limit < area) {
            batchLoopNumber  = dst->batch();
            unitBatch        = 1;
            unitHeight       = limit / dst->width();
            heightLoopNumber = UP_DIV(dst->height(), unitHeight);
        } else if (limit < area * dst->batch()) {
            unitBatch       = limit / area;
            batchLoopNumber = UP_DIV(dst->batch(), unitBatch);
        }
        int loopNumber = batchLoopNumber * heightLoopNumber;
        mConvParams.resize(loopNumber);
        mMultilers.resize(loopNumber);
        mIm2ColSet.resize(loopNumber);
        mCol2ImSet.resize(loopNumber);
        reinterpret_cast<VulkanTensor*>(src->deviceId())->image()->barrierRead(cmdBuffer->get());
        reinterpret_cast<VulkanTensor*>(dst->deviceId())->image()->barrierWrite(cmdBuffer->get());
        for (int i = 0; i < batchLoopNumber; ++i) {
            int batchOffset  = i * unitBatch;
            int currentBatch = dst->batch() - batchOffset;
            if (currentBatch > unitBatch) {
                currentBatch = unitBatch;
            }
            for (int j = 0; j < heightLoopNumber; ++j) {
                int heightOffset  = j * unitHeight;
                int currentHeight = dst->height() - heightOffset;
                if (currentHeight > unitHeight) {
                    currentHeight = unitHeight;
                }
                auto index = i * heightLoopNumber + j;
                auto totalNumberInput  = currentBatch * icDiv4 * dst->width() * currentHeight;
                auto totalNumberOutput = currentBatch * ocDiv4 * dst->width() * currentHeight;
                mConvParams[index] = std::make_shared<VulkanBuffer>(vkBn->getMemoryPool(), false,
                                                                    sizeof(VulkanConvolutionCommon::ConvolutionParameter),
                                                                    nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
                {
                    auto convCons = reinterpret_cast<VulkanConvolutionCommon::ConvolutionParameter*>(mConvParams[index]->map());
                    VulkanConvolutionCommon::writeParameter(convCons, mConvCommonOption, src, dst);
                    convCons->offset[0]     = batchOffset;
                    convCons->offset[1]     = heightOffset;
                    convCons->outputSize[3] = currentBatch;
                    convCons->outputSize[1] = currentHeight;
                    mConvParams[index]->unmap();
                }
                mIm2ColSet[index].reset(mIm2Col->createSet());
                mCol2ImSet[index].reset(mCol2Im->createSet());
                mMultilers[index] = mMultiCreator();
                mMultilers[index]->prepare(static_cast<VulkanBackend*>(backend())->getInitCommandBuffer(),
                                           dst->width() * currentHeight * currentBatch);
                auto mMultiler = mMultilers[index].get();
                if (true) {
                    auto colImage = mMultiler->source();
                    // Barrier
                    mIm2ColSet[index]->writeImage(colImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
                    mIm2ColSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(src->deviceId()))->image()->view(),
                                                  mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
                    mIm2ColSet[index]->writeBuffer(mConvParams[index]->buffer(), 2, mConvParams[index]->size());
                    mIm2Col->bind(cmdBuffer->get(), mIm2ColSet[index]->get());
                    colImage->barrierWrite(cmdBuffer->get());
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberInput, VulkanConvolutionCommon::gImage2ColLocal), 1, 1);
                }
                mMultilers[index]->compute(cmdBuffer);
                if (true) {
                    auto dstImage = mMultiler->dest();
                    mCol2ImSet[index]->writeImage(dstImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 0);
                    mCol2ImSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(dst->deviceId()))->image()->view(),
                                                  mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 1);
                    mCol2ImSet[index]->writeImage(mBias->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 2);
                    mCol2ImSet[index]->writeBuffer(mConvParams[index]->buffer(), 3, mConvParams[index]->size());
                    mCol2Im->bind(cmdBuffer->get(), mCol2ImSet[index]->get());
                    dstImage->barrierRead(cmdBuffer->get());
                    mBias->barrierRead(cmdBuffer->get());
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberOutput, VulkanConvolutionCommon::gImage2ColLocal), 1, 1);
                }
            }
        }
        return NO_ERROR;
    }
};
```
1.1.1.1.4.2 VulkanCommandPool and Buffer
```
// source/backend/vulkan/component/VulkanCommandPool.hpp
class VulkanCommandPool : public NonCopyable {
public:
    VulkanCommandPool(const VulkanDevice& dev);
    virtual ~VulkanCommandPool();

    class Buffer : public NonCopyable {
    public:
        Buffer(const VulkanCommandPool* pool);
        virtual ~Buffer();

        VkCommandBuffer get() const {
            return mBuffer;
        }

        void begin(VkCommandBufferUsageFlags flags) const;
        void end() const;

        enum BarrierType {
            READ_WRITE = 0,
            WRITE_WRITE,
            WRITE_READ,
        };
        void barrierSource(VkBuffer source, size_t start, size_t end, BarrierType type = READ_WRITE) const;
        void barrierSource(std::tuple<VkBuffer, VkDeviceSize, VkDeviceSize>, BarrierType type = READ_WRITE) const;

    private:
        VkCommandBuffer mBuffer;
        const VulkanCommandPool* mPool;
    };

    VulkanCommandPool::Buffer* allocBuffer() const;

    VkCommandPool pool() const {
        return mPool;
    }

    void submitAndWait(VkCommandBuffer buffer) const;

private:
    const VulkanDevice& mDevice;
    VkCommandPool mPool;
    mutable std::vector<VkCommandBuffer> mFreeBuffers;
};

// source/backend/vulkan/component/VulkanCommandPool.cpp
void VulkanCommandPool::Buffer::begin(VkCommandBufferUsageFlags flag) const {
    VkCommandBufferBeginInfo cmdBufferBeginInfo{
        /* .sType            = */ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        /* .pNext            = */ nullptr,
        /* .flags            = */ flag,
        /* .pInheritanceInfo = */ nullptr,
    };
    vkResetCommandBuffer(mBuffer, 0);
    CALL_VK(vkBeginCommandBuffer(mBuffer, &cmdBufferBeginInfo));
}

void VulkanCommandPool::Buffer::end() const {
    CALL_VK(vkEndCommandBuffer(mBuffer));
}
```
1.1.1.1.5 Backend::onResizeEnd
The code in Pipeline::allocMemory that calls VulkanBackend::onResizeEnd and CPUBackend::onResizeEnd is:
```
auto code = mBackend->onResizeEnd();
if (code != NO_ERROR) {
    return code;
}
code = mBackupBackend->onResizeEnd();
```
onResizeEnd is a virtual function; since mBackend is a VulkanBackend (which derives from Backend), the call resolves to VulkanBackend::onResizeEnd:
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
ErrorCode VulkanBackend::onResizeEnd() {
    if (!mDirect) {
        mCmdBuffer->end();
    }
    // Finish recording mInitBuffer
    mInitBuffer->end();
    mCmdBuffers.emplace_back(mInitBuffer->get());
    // Submit the just-recorded mInitBuffer, i.e. mCmdBuffers
    _finish();
    return NO_ERROR;
}
```
mBackupBackend is a CPUBackend (which derives from Backend); its implementation is analyzed in the linked article.
1.1.1.1.5.1 _finish
```
// source/backend/vulkan/image/backend/VulkanBackend.cpp
void VulkanBackend::_finish() const {
    if (mCmdBuffers.empty()) {
        return;
    }
    VkSubmitInfo submit_info = {/* .sType                = */ VK_STRUCTURE_TYPE_SUBMIT_INFO,
                                /* .pNext                = */ nullptr,
                                /* .waitSemaphoreCount   = */ 0,
                                /* .pWaitSemaphores      = */ nullptr,
                                /* .pWaitDstStageMask    = */ nullptr,
                                /* .commandBufferCount   = */ (uint32_t)mCmdBuffers.size(),
                                /* .pCommandBuffers      = */ mCmdBuffers.data(),
                                /* .signalSemaphoreCount = */ 0,
                                /* .pSignalSemaphores    = */ nullptr};
    auto fenceReal = mFence->get();
    mFence->reset();
    CALL_VK(vkQueueSubmit(device().acquireDefaultDevQueue(), 1, &submit_info, fenceReal));
    auto res = mFence->wait();
    MNN_VK_CHECK(res);
    mCmdBuffers.clear();
}
```
How each op's recorded command buffers are executed is covered in the linked article.
☆