MNN Session: Vulkan Operators (Part 8)

Summary: MNN Session: Vulkan Operators (Part 8)

1. createSession

    Create a session from the given ScheduleConfig and RuntimeInfo.

// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
    return createMultiPathSession({config}, runtime);
}
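
For reference, here is a minimal caller-side sketch of how such a session is typically created with the Vulkan backend. The model path is illustrative and the use of backupType is an assumption made for this example, not taken from the article:

// Usage sketch (hypothetical model path; standard MNN public API assumed).
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    MNN::ScheduleConfig config;
    config.type       = MNN_FORWARD_VULKAN; // schedule ops onto the Vulkan backend
    config.backupType = MNN_FORWARD_CPU;    // unsupported ops fall back to CPU
    // createSession forwards to createMultiPathSession({config}, runtime) as shown above.
    auto session = net->createSession(config);
    // ... fill inputs, then net->runSession(session);
    return 0;
}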


1.1 createMultiPathSession

Excerpt from createMultiPathSession (the full code is in Interpreter.cpp):

// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
  // ...
    auto result = newSession.get();
    auto validForResize = info.validForResize;
    if (validForResize && mNet->modes.inputMode == Session_Input_Inside && mNet->modes.resizeMode == Session_Resize_Direct) {
        result->resize();
    }
    // ...
    return result;
}

1.1.1 Session::resize

Excerpt from Session::resize (the full code is in Session.cpp):

// source/core/Session.cpp
ErrorCode Session::resize() {
  // ...
    if (mNeedMalloc) {
        // Set needResize = true for easy for judge in runSession when error
        mNeedResize = true;
        // Turn Pipeline to Command Buffer and Malloc resource
        // TODO: Separate Schedule and Malloc
        bool forbidReplace = permitCodegen;
        if (mInfo.constReplaceBackend != nullptr) {
            forbidReplace = true;
        }
        for (auto& iter : mPipelines) {
            auto error = iter->allocMemory(firstMalloc, forbidReplace);
            if (NO_ERROR != error) {
                return error;
            }
        }
        // ...
        mNeedMalloc = false;
        mNeedResize = false;
    }
  // ...
    return NO_ERROR;
}

1.1.1.1 Pipeline::allocMemory

The relevant type definitions (OpCacheInfo, BackendCache, Command, CommandBuffer) are reproduced in the comments below, followed by Pipeline::allocMemory:

// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
//   struct BackendCache {
//      Backend::Info info;
//      BackendConfig config;
//      std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
//      bool needComputeShape = true;
//      bool needComputeGeometry = true;
//      bool reportError = true;
//      std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
//  };
//
//    /** pipeline info */
//    struct OpCacheInfo {
//        /** op */
//        const Op* op;
//        /** input tensors */
//        std::vector<Tensor*> inputs;
//        /** output tensors */
//        std::vector<Tensor*> outputs;
//        /** schedule type*/
//        Schedule::Type type = Schedule::Type::SEPARATE;
//
//        /**Command buffer for cache*/
//        CommandBuffer cacheBuffer;
//
//        /**Command buffer for execute*/
//        CommandBuffer executeBuffer;
//        
//        std::map<const Op*, std::shared_ptr<Execution>> executionCache;
//    };
// 
// struct Command : public RefCount {
//     const Op* op;
//     std::vector<Tensor*> workInputs;
//     std::vector<Tensor*> workOutputs;
//     std::vector<Tensor*> inputs;
//     std::vector<Tensor*> outputs;
//     std::shared_ptr<BufferStorage> buffer;
//     std::shared_ptr<Execution> execution;
//     std::shared_ptr<OperatorInfo> info;
//     #ifdef MNN_BUILD_CODEGEN
//     bool canVectorize = false;
//     #endif
// };
// 
// struct CommandBuffer {
//     std::vector<SharedPtr<Command>> command;
//     std::vector<std::shared_ptr<Tensor>> extras;
//     bool hasWrap = false;
// };
ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) {
    // MNN_PRINT("allocMemory mtype:%d, cpubackendType:%d, cpuBackend runtime:%p\n", mBackend->type(), mBackupBackend->type(), mBackupBackend->getRuntime());
    if (!firstMalloc) {
        // For session setNeedMalloc, if session's output is set as some input, It may cause error
        // Dup des to avoid it
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (const auto& infoP : buffer.command) {
                auto& info = *infoP;
                for (auto t : info.workOutputs) {
                    if (!TensorUtils::getDescribe(t)->isMutable) {
                        continue;
                    }
                    auto des = TensorUtils::getDescribe(t);
                    auto usage = des->usage;
                    if (TensorUtils::getDescribeOrigin(t)->mContent->count() > 1) {
                        TensorUtils::getDescribeOrigin(t)->mContent = new Tensor::InsideDescribe::NativeInsideDescribe;
                        auto dstDes = TensorUtils::getDescribe(t);
                        t->buffer().dim = dstDes->dims;
                        ::memcpy(t->buffer().dim, des->dims, MNN_MAX_TENSOR_DIM * sizeof(halide_dimension_t));
                        dstDes->dimensionFormat = des->dimensionFormat;
                        dstDes->usage = usage;
                        dstDes->regions = des->regions;
                        dstDes->quantAttr = des->quantAttr;
                        dstDes->tensorArrayAttr = des->tensorArrayAttr;
                    }
                }
            }
        }
    }

  // mInfo is a PipelineInfo, i.e. std::pair<BackendCache, std::vector<OpCacheInfo>>
  // Start creating Executions
    /* Create Execution Begin */
    // mInfo.first is the BackendCache
    // mInfo.first.cache.first is a std::shared_ptr<Backend>: the primary backend (VulkanBackend here)
    auto& mBackend = mInfo.first.cache.first;
    // mInfo.first.cache.second is a std::shared_ptr<Backend>: the fallback CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    mBackend->onClearBuffer();
    mBackupBackend->onClearBuffer();
    // Check if we need a long time for init
    if (mBackend->type() != MNN_FORWARD_CPU && mBackend->type() != MNN_FORWARD_CPU_EXTENSION && mTuneAttr.autoSetOpType) {
        Runtime::OpInfo dstInfo;
        int currentInitCount = 0;
        std::vector<Schedule::OpCacheInfo> initInfos;
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (auto& iterP : buffer.command) {
                auto& iter = *iterP;
                dstInfo.initCostLong = false;
                mRuntime->onMeasure(iter.inputs, iter.outputs, iter.op, dstInfo);
                if (dstInfo.initCostLong) {
                    initInfos.emplace_back(info);
                    currentInitCount++;
                    break;
                }
            }
            if (currentInitCount >= mTuneAttr.maxTuningNumber) {
                break;
            }
        }
        if (currentInitCount > 0) {
            MNN_PRINT("Turn back to cpu\n");
            // Reset execution
            for (auto& info : mInfo.second) {
                info.executionCache.clear();
                for (auto& iterP : info.executeBuffer.command) {
                    iterP->execution = nullptr;
                    iterP->execution = nullptr;
                    _recycleDynamicMemory(iterP.get());
                }
            }
            if (!mRuntime->hasAsyncWork()) {
                _pushTuningTask(std::move(initInfos));
            }
            mBackend.reset(mCpuRuntime->onCreate(nullptr));
        }
    }
    {
      // Create Executions
        auto code = _createExecutions(mInfo);
        if (NO_ERROR != code) {
            return code;
        }
    }
    /* Create Execution End */

    _SetTensorBackend(mInfo, mAllocInput);
    // Insert Wrap If needed
    {
        auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mShapeFixConstCache, mAllocInput, forbidReplace);
        if (NO_ERROR != insertCode) {
            return insertCode;
        }
    }
    /* Insert Wrap End*/

    // Compute RefCount Begin
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        // MNN_PRINT("before resize, mInfo.second size:%lu, command size:%lu,op type:%s, op name:%s\n", mInfo.second.size(), buffer.command.size(), EnumNameOpType(info.op->type()), info.op->name()->c_str());
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            for (auto t : iter.workInputs) {
                auto des = TensorUtils::getDescribe(t);
                if (des->usage != Tensor::InsideDescribe::CONSTANT) {
                    des->useCount = 0;
                }
            }
        }
    }
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            for (auto t : iter.workInputs) {
                auto des = TensorUtils::getDescribe(t);
                if (des->usage != Tensor::InsideDescribe::CONSTANT) {
                    des->useCount += 1;
                }
            }
        }
    }
    // Compute RefCount End

    // Alloc tensor
    mBackend->onResizeBegin();
    mBackupBackend->onResizeBegin();
    // mInfo.first is the BackendCache
    // mInfo.second is a std::vector<OpCacheInfo>
    for (auto& info : mInfo.second) {
      // info.executeBuffer is a CommandBuffer
        auto& buffer = info.executeBuffer;
        // buffer.command is a std::vector<SharedPtr<Command>>
        for (int cmdIndex=0; cmdIndex < buffer.command.size(); ++cmdIndex) {
            auto& iterP = buffer.command[cmdIndex];
            auto& iter = *iterP;
#ifdef MNN_PIPELINE_DEBUG
            auto memory = const_cast<Runtime*>(mRuntime)->onGetMemoryInMB();
            if (nullptr != info.op->name()) {
                MNN_PRINT("%f, before Resize: %s - %d\n", memory, info.op->name()->c_str(), cmdIndex);
            }
#endif

            // MNN_PRINT("before Resize: optype:%s, name:%s, input0:%p, output0:%p, mAllocInput:%d\n", EnumNameOpType(iter.op->type()), iter.info->name().c_str(), iter.inputs[0], iter.outputs[0], mAllocInput);
            // Alloc for Tensors        
            // iter is a Command
            // iter.execution is a std::shared_ptr<Execution>
            auto curBackend = iter.execution->backend();
            if (mAllocInput) {
                for (auto t : iter.workInputs) {
                    auto allocRes = _allocTensor(t, curBackend, mOutputStatic);
                    if (!allocRes) {
                        return OUT_OF_MEMORY;
                    }
                }
            }
            {
                for (auto t : iter.workOutputs) {
                    auto res = _allocTensor(t, curBackend, mOutputStatic);
                    if (!res) {
                        return OUT_OF_MEMORY;
                    }
                }
            }
#ifdef MNN_PIPELINE_DEBUG
            if (iter.info != nullptr) {
                MNN_PRINT("before Resize 2, calling: %s - %d \n", iter.info->name().c_str(), cmdIndex);
            }
#endif
      // iter.execution is a std::shared_ptr<Execution>
            auto code = iter.execution->onResize(iter.workInputs, iter.workOutputs);
            if (NO_ERROR != code) {
#ifdef MNN_PIPELINE_DEBUG
                MNN_ERROR("Pipeline Resize error: %d\n", code);
#endif
                if (iter.info.get()) {
                    MNN_ERROR("Resize error for type = %s, name = %s \n", iter.info->type().c_str(), iter.info->name().c_str());
                }
                return code;
            }
            // Free mid tensor
            for (auto t : iter.workInputs) {
                _releaseTensor(t, mAllocInput);
            }
        }
    }
    // Recycle All Dynamic Tensor
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        for (auto& c : buffer.command) {
            _recycleDynamicMemory(c.get());
        }
    }
    auto code = mBackend->onResizeEnd();
    if (code != NO_ERROR) {
        return code;
    }
    code = mBackupBackend->onResizeEnd();
    return code;
}

1.1.1.1.1 _createExecutions

The type definitions for OpCacheInfo, BackendCache, Command, and CommandBuffer are the same as those shown above for Pipeline::allocMemory.

// source/core/Pipeline.cpp
static ErrorCode _createExecutions(Schedule::PipelineInfo& mInfo) {
  // mInfo.first is the BackendCache
    // mInfo.first.cache.first is a std::shared_ptr<Backend>: the primary backend (VulkanBackend here)
    auto& mBackend = mInfo.first.cache.first;
    // mInfo.first.cache.second is a std::shared_ptr<Backend>: the fallback CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    // mInfo.second is a std::vector<OpCacheInfo>
    for (auto& info : mInfo.second) {
        auto& buffer = info.executeBuffer;
        // MNN_PRINT("before resize, mInfo.second size:%lu, command size:%lu,op type:%s, op name:%s\n", mInfo.second.size(), buffer.command.size(), EnumNameOpType(info.op->type()), info.op->name()->c_str());
        // buffer is a CommandBuffer
        // buffer.command is a std::vector<SharedPtr<Command>>
        for (auto& iterP : buffer.command) {
            auto& iter = *iterP;
            // Create exe
            // Find Cache
            // Look the execution up in the cache first; create it if not found
            bool cached    = false;
            if (nullptr == iter.execution) {
                /** Cache origin execution for fast resize*/
                auto exeIter = info.executionCache.find(iter.op);
                if (exeIter != info.executionCache.end()) {
                    iter.execution = exeIter->second;
                    cached         = true;
                }
            }
            if (nullptr == iter.execution) {
              // First try to create it with the primary backend (VulkanBackend here)
              // iter is a Command
              // iter.execution is a std::shared_ptr<Execution>
                iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));
            }
            if (nullptr == iter.execution) {
                // Try Backup
                // If that failed, fall back to the backup backend (CPUBackend here)
                iter.execution.reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op));
                if (nullptr == iter.execution) {
                    if (mInfo.first.reportError) {
                        MNN_ERROR("Create execution error : %d\n", iter.op->type());
                    }
                    return NOT_SUPPORT;
                }
            }
            // invalid means memory alloc failed
            if (!iter.execution->valid()) {
                iter.execution = nullptr;
                iter.execution = nullptr;
                return OUT_OF_MEMORY;
            }
            if ((!cached) && iter.buffer == nullptr && (iter.op->type() != OpType_Raster) && (iter.op->type() != OpType_BinaryOp)) {
              // info is an OpCacheInfo
              // info.executionCache is a std::map<const Op*, std::shared_ptr<Execution>>
                info.executionCache.insert(std::make_pair(iter.op, iter.execution));
            }
        }
    }
    return NO_ERROR;
}

1.1.1.1.1.1 VulkanBackend::onCreate

    The call to VulkanBackend::onCreate from _createExecutions looks like this:

  iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));

    Since mBackend is a VulkanBackend (which inherits from Backend), the call actually dispatches to VulkanBackend::onCreate, implemented as follows:

// source/backend/vulkan/image/backend/VulkanBackend.cpp
Execution* VulkanBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                   const MNN::Op* op) {
    auto creator = getCreatorMap();
    auto iter    = creator->find(op->type());
    std::string name = "";
    if (nullptr != op->name()) {
        name = op->name()->str();
    }
    if (iter == creator->end()) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_PRINT("Vulkan don't support %d, %s: %s\n", op->type(), EnumNameOpType(op->type()),
                name.c_str());
#endif
        return nullptr;
    }
    bool valid = true;
    for (int i=0; i<inputs.size(); ++i) {
        if (!OpCommonUtils::opNeedContent(op, i)) {
            continue;
        }
        auto t = inputs[i];
        auto inputDes = TensorUtils::getDescribe(t);
        if (inputDes->memoryType == Tensor::InsideDescribe::MEMORY_VIRTUAL) {
            for (auto& r : inputDes->regions) {
                if (!_supportImageSize(r.origin)) {
                    valid = false;
                    break;
                }
            }
            if (!valid) {
                break;
            }
        } else {
            if (!_supportImageSize(t)) {
                valid = false;
                break;
            }
        }
    }
    for (auto t : outputs) {
        if (!_supportImageSize(t)) {
            valid = false;
            break;
        }
    }
    if (!valid) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_ERROR("Vulkan don't support for %s, type=%s, Tensor not support\n", name.c_str(), EnumNameOpType(op->type()));
#endif
        return nullptr;
    }
    // iter->second is an MNN::VulkanBackend::Creator*; this creates the concrete operator execution
    auto originExecution = (VulkanBasicExecution*)iter->second->onCreate(inputs, outputs, op, this);
    if (nullptr == originExecution) {
#ifdef MNN_OP_SUPPORT_LOG
        MNN_ERROR("Vulkan don't support for %s, type=%s, Special case\n", name.c_str(), EnumNameOpType(op->type()));
#endif
        return nullptr;
    }
    if (mDirect) {
        return new VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution>(originExecution));
    }
    return new VulkanBasicExecutionInDirect(std::shared_ptr<VulkanBasicExecution>(originExecution));
}

1.1.1.1.1.1.1 VulkanBackend::Creator::onCreate

    VulkanBackend::onCreate calls VulkanBackend::Creator::onCreate as follows:

  auto creator = getCreatorMap();
    auto iter    = creator->find(op->type());
    // iter->second is an MNN::VulkanBackend::Creator*; this creates the concrete operator execution
    auto originExecution = (VulkanBasicExecution*)iter->second->onCreate(inputs, outputs, op, this);

    Note: the iter->second->onCreate call is polymorphic; at runtime a different subclass is invoked depending on the operator type (opType). The base class is VulkanBackend::Creator.

    One such implementation is VulkanConvolutionCreator:

// source/backend/vulkan/image/execution/VulkanConvolution.cpp
class VulkanConvolutionCreator : public VulkanBackend::Creator {
public:
    virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op,
                                Backend* backend) const override {
        auto extra          = static_cast<VulkanBackend *>(backend);
        auto convReal       = op->main_as_Convolution2D();
        auto common         = convReal->common();
        auto outputCount    = common->outputCount();
        const int fh        = common->kernelY();
        const int fw        = common->kernelX();
        int srcCount        = 0;
        const float* source = nullptr;
        const float* biasPtr = nullptr;
        int weightSize = 0;
        std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();
            if (1 == quan->type() || 2 == quan->type()) {
                if (quan->has_scaleInt()) {
                    // Don't support IDST-int8 because of error
                    return nullptr;
                }
            }
            quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
            srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw);
            source   = quanWeight->weightFloat.get();
            weightSize = quanWeight->weightFloat.size();
        } else {
            if (nullptr != convReal->weight()) {
                srcCount = convReal->weight()->size() / (outputCount * fh * fw);
                source   = convReal->weight()->data();
                weightSize = convReal->weight()->size();
            } else {
                srcCount = convReal->common()->inputCount();
            }
        }
        if (nullptr != convReal->bias()) {
            biasPtr = convReal->bias()->data();
        }
        if (op->type() == OpType_Convolution) {
            if (inputs.size() > 1) {
                return nullptr;
            }
            auto convCommonParam = op->main_as_Convolution2D()->common();
            const int group      = convCommonParam->group();
            if (1 == group) {
                return VulkanConvolutionImpl::create(extra, common, inputs, outputs[0], source,
                                                     biasPtr, srcCount, outputCount);

            } else {
                return nullptr;
            }
        }
        return new VulkanConvolutionDepthwise(source, weightSize, op, backend);
    }
};

static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_Convolution, new VulkanConvolutionCreator);
    VulkanBackend::addCreator(OpType_ConvolutionDepthwise, new VulkanConvolutionCreator);
    return true;
}();

1.1.1.1.1.1.2 VulkanBasicExecution

    VulkanBasicExecution is the base class of all Vulkan operator executions:

// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecution {
public:
    VulkanBasicExecution(Backend *bn) : mBackend(bn) {
        //Do nothing
    }
    virtual ~VulkanBasicExecution() = default;

    virtual ErrorCode onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                               const VulkanCommandPool::Buffer *cmdBuffer) = 0;

    Backend* backend() {
        return mBackend;
    }
private:
    Backend* mBackend;
};

1.1.1.1.1.1.3 Registering Vulkan operator creators

    VulkanBackend::onCreate looks operators up in gCreator, a global map that caches every VulkanBackend::Creator instance. Registration is spread across the individual creator source files and is performed through static initialization.

// source/backend/vulkan/image/execution/VulkanConvolution.cpp
static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_Convolution, new VulkanConvolutionCreator);
    VulkanBackend::addCreator(OpType_ConvolutionDepthwise, new VulkanConvolutionCreator);
    return true;
}();

  VulkanBackend::addCreator is implemented as follows:

// source/backend/vulkan/image/backend/VulkanBackend.cpp
bool VulkanBackend::addCreator(OpType t, Creator* c) {
    auto allKind = getCreatorMap();
    allKind->insert(std::make_pair(t, c));
    return true;
}

static std::map<OpType, VulkanBackend::Creator*>* gCreator = nullptr;

// Creator
static inline std::map<OpType, VulkanBackend::Creator*>* getCreatorMap() {
    if (nullptr == gCreator) {
        gCreator = new std::map<OpType, VulkanBackend::Creator*>();
    }
    return gCreator;
}

Adding a Vulkan implementation

  1. Add the shader
    Add the compute shader (*.comp) under source/backend/vulkan/execution/glsl. If the input memory layout is NC4HW4, implement it with images; otherwise use buffers. Existing implementations in the directory can serve as references. Then run the makeshader.py script to compile the shaders.
  2. Declare the implementation class
    Add VulkanMyCustomOp.hpp and VulkanMyCustomOp.cpp under source/backend/vulkan/execution/:
class VulkanMyCustomOp : public VulkanBasicExecution {
public:
    VulkanMyCustomOp(const Op* op, Backend* bn);
    virtual ~VulkanMyCustomOp();
    ErrorCode onEncode(const std::vector<Tensor*>& inputs, 
                       const std::vector<Tensor*>& outputs,
                       const VulkanCommandPool::Buffer* cmdBuffer) override;
private:
    // Parameters needed by the GPU shader
    std::shared_ptr<VulkanBuffer> mConstBuffer;
    // Pipeline
    const VulkanPipeline* mPipeline;
    // Layout Descriptor Set
    std::shared_ptr<VulkanPipeline::DescriptorSet> mDescriptorSet;
};
  3. Implement the op
    Implement onEncode. First check the memory layout: for NC4HW4 the shader works on images, otherwise on buffers. Return NO_ERROR once encoding succeeds (a sketch follows the registration code below).
  4. Register the implementation class
class VulkanMyCustomOpCreator : public VulkanBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, 
                                const MNN::Op* op,
                                Backend* backend) const override {
        return new VulkanMyCustomOp(op, backend);
    }
};
static bool gResistor = []() {
    VulkanBackend::addCreator(OpType_MyCustomOp, new VulkanMyCustomOpCreator);
    return true;
}();
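
The onEncode step can be sketched as follows. This is only an illustration under assumptions: the uniform layout, binding indices, the work-group size of 256, and the helper getCommonSampler() are hypothetical or assumed; only the NC4HW4 (image) path is shown; and mPipeline, mConstBuffer, and mDescriptorSet are presumed to have been created in the constructor. The calls mirror the patterns used by VulkanConvolutionIm2Col later in this article.

// Hypothetical sketch of VulkanMyCustomOp::onEncode (NC4HW4 / image path only).
ErrorCode VulkanMyCustomOp::onEncode(const std::vector<Tensor*>& inputs,
                                     const std::vector<Tensor*>& outputs,
                                     const VulkanCommandPool::Buffer* cmdBuffer) {
    auto input  = inputs[0];
    auto output = outputs[0];
    auto vkBn   = static_cast<VulkanBackend*>(backend());

    // 1. Fill the uniform buffer with the scalar parameters the shader needs.
    struct Param {
        int size[4]; // width, height, channel/4, batch (hypothetical layout)
    };
    auto param     = reinterpret_cast<Param*>(mConstBuffer->map());
    param->size[0] = output->width();
    param->size[1] = output->height();
    param->size[2] = UP_DIV(output->channel(), 4);
    param->size[3] = output->batch();
    mConstBuffer->unmap();

    // 2. Bind the input/output images and the uniform buffer to the descriptor set.
    auto inputImage  = reinterpret_cast<VulkanTensor*>(input->deviceId())->image();
    auto outputImage = reinterpret_cast<VulkanTensor*>(output->deviceId())->image();
    mDescriptorSet.reset(mPipeline->createSet());
    mDescriptorSet->writeImage(outputImage->view(), vkBn->getCommonSampler()->get(),
                               VK_IMAGE_LAYOUT_GENERAL, 0);
    mDescriptorSet->writeImage(inputImage->view(), vkBn->getCommonSampler()->get(),
                               VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
    mDescriptorSet->writeBuffer(mConstBuffer->buffer(), 2, mConstBuffer->size());

    // 3. Record barriers and the dispatch into the command buffer provided by onResize.
    inputImage->barrierRead(cmdBuffer->get());
    outputImage->barrierWrite(cmdBuffer->get());
    mPipeline->bind(cmdBuffer->get(), mDescriptorSet->get());
    int total = output->batch() * UP_DIV(output->channel(), 4) * output->width() * output->height();
    vkCmdDispatch(cmdBuffer->get(), UP_DIV(total, 256), 1, 1);
    return NO_ERROR;
}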

Adding a Metal implementation

  1. Add the shader
    Add MetalMyCustomOp.metal under source/backend/Metal and add it to the Xcode project. Existing .metal files in the directory can serve as references.
  2. Declare the implementation class

Add MetalMyCustomOp.hpp and MetalMyCustomOp.cpp under source/backend/Metal and add them to the Xcode project:

 class MetalMyCustomOp : public Execution {
public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, 
                               const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, 
                                const std::vector<Tensor *> &outputs) override;
};
  3. Implement onResize and onExecute
    Unlike CPU tensors, which store their data behind the host pointer, Metal stores the data handle in deviceId, which holds an id<MTLBuffer>:
auto buffer = (__bridge id<MTLBuffer>)(void *)tensor->deviceId();

Op-specific parameters for a Metal op can also be stored in an id<MTLBuffer>. The buffer's data type does not have to match the tensor's, and a buffer may even mix several data types, as long as the correct length is specified at creation time. For example:

auto buffer = [context newDeviceBuffer:2 * sizeof(int) + 2 * sizeof(__fp16) access:CPUWriteOnly];
((__fp16 *)buffer.contents)[0] = mAlpha / mLocalSize;  // alpha
((__fp16 *)buffer.contents)[1] = mBeta;                // beta
((int *)buffer.contents)[1] = mLocalSize;              // local size
((int *)buffer.contents)[2] = inputs[0]->channel();    // channel

When creating a buffer, an access-control mode must be specified. Three modes are currently available:

  • CPUReadWrite: the data is shared between CPU and GPU; typically used for device buffers.
  • CPUWriteOnly: the data is written by the CPU and never read back; typically used for parameter buffers.
  • CPUTransparent: the data lives only on the GPU; typically used for heap buffers.

MNNMetalContext offers two similar sets of buffer-creation interfaces that differ only in the data's lifetime:

  • Memory held by a device buffer is never reused within a single inference pass.
  • Memory held by a heap buffer can be reused by other ops once -[MNNMetalContext releaseHeapBuffer:] has been called.

In general, heap buffers are only used together with CPUTransparent. Heap buffers are only effective on iOS 10+; on iOS 9 and below they fall back to device buffers.

When using Metal, do not create your own device or library unless there is a special reason. Loading a library and compiling functions are time-consuming, and MNNMetalContext already performs the necessary caching. An example of running Metal work through the context:

auto context   = (__bridge MNNMetalContext *)backend->context();
auto kernel    = /* metal kernel name NSString */;
auto encoder   = [context encoder];
auto bandwidth = [context load:kernel encoder:encoder];
/* encoder set buffer(s)/sampler(s) */
[context dispatchEncoder:encoder 
           threads:{x, y, z}
      maxThreadsPerGroup:maxThreadsPerThreadgroup]; // recommended way to dispatch
[encoder endEncoding];
  4. Register the implementation class
class MetalMyCustomOpCreator : public MetalBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, 
                                const MNN::Op *op, Backend *backend) const {
        return new MetalMyCustomOp(backend);
    }
};
REGISTER_METAL_OP_CREATOR(MetalMyCustomOpCreator, OpType_MyCustomOp);


After adding the registration code, re-run CMake so that the registration files are regenerated automatically.

Adding an OpenCL implementation

  1. Add the kernel

    Add the kernel (*.cl) under source/backend/opencl/execution/cl. Feature maps are currently all implemented with image2d. Existing implementations in the directory can serve as references. Then run opencl_codegen.py to generate the kernel mapping.

  2. Declare the implementation class

Add MyCustomOp.h and MyCustomOp.cpp under source/backend/opencl/execution/:

template <typename T>
class MyCustomOp : public Execution {
public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, 
                               const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, 
                                const std::vector<Tensor *> &outputs) override;
};
  3. Implement the op
    Implement onResize (optional) and onExecute, returning NO_ERROR on success.
  4. Register the implementation class
OpenCLCreatorRegister<TypedCreator<MyCustomOp<cl_data_t>>> __my_custom_op(OpType_MyCustomOp);

Adding an OpenGL implementation

  1. Add the shader
    Add the shader (*.glsl, no file header needed) under source/backend/opengl/glsl; feature maps are all represented with image3d. Existing implementations in the directory can serve as references. Then run makeshader.py in the source/backend/opengl directory.
  2. Add the executor

Add GLMyCustomOp.h and GLMyCustomOp.cpp under source/backend/opengl/execution/:

class GLMyCustomOp : public Execution {
public:
    GLMyCustomOp(const std::vector<Tensor *> &inputs, const Op *op, Backend *bn);
    virtual ~GLMyCustomOp();
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, 
                                const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, 
                               const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<GLProgram> mProgram;
};
  3. Implement the op
    Implement onResize (optional) and onExecute, returning NO_ERROR on success.
  4. Register the implementation class
GLCreatorRegister<TypedCreator<GLMyCustomOp>> __my_custom_op(OpType_MyCustomOp);

1.1.1.1.1.1.4 VulkanBasicExecutionDirect

// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecutionDirect : public Execution {
public:
    VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution> encoder);
    virtual ~ VulkanBasicExecutionDirect() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    std::shared_ptr<VulkanBasicExecution> mEncoder;
    std::shared_ptr<VulkanCommandPool::Buffer> mCmdBuffer;
};

1.1.1.1.1.1.5 VulkanBasicExecutionDirect::VulkanBasicExecutionDirect

// source/backend/vulkan/image/execution/VulkanBasicExecution.cpp
VulkanBasicExecutionDirect::VulkanBasicExecutionDirect(std::shared_ptr<VulkanBasicExecution> encoder) : Execution(encoder->backend()) {
    mEncoder = encoder;
    auto extra = static_cast<VulkanBackend *>(encoder->backend());
    mCmdBuffer.reset(const_cast<VulkanCommandPool::Buffer *>(extra->getPool().allocBuffer()));
}

1.1.1.1.1.1.6 VulkanBasicExecutionInDirect

// source/backend/vulkan/image/execution/VulkanBasicExecution.hpp
class VulkanBasicExecutionInDirect : public Execution {
public:
    VulkanBasicExecutionInDirect(std::shared_ptr<VulkanBasicExecution> encoder);
    virtual ~ VulkanBasicExecutionInDirect() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        return NO_ERROR;
    }
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    
private:
    std::shared_ptr<VulkanBasicExecution> mEncoder;
};

1.1.1.1.1.2 CPUBackend::onCreate

    In _createExecutions, VulkanBackend::onCreate is tried first to create the operator execution; if that fails, the fallback CPUBackend::onCreate is used.

            if (nullptr == iter.execution) {
              // First try to create it with the primary backend (VulkanBackend here)
              // iter is a Command
              // iter.execution is a std::shared_ptr<Execution>
                iter.execution.reset(mBackend->onCreate(iter.inputs, iter.outputs, iter.op));
            }
            if (nullptr == iter.execution) {
                // Try Backup
                // If that failed, fall back to the backup backend (CPUBackend here)
                iter.execution.reset(mBackupBackend->onCreate(iter.inputs, iter.outputs, iter.op));
                if (nullptr == iter.execution) {
                    if (mInfo.first.reportError) {
                        MNN_ERROR("Create execution error : %d\n", iter.op->type());
                    }
                    return NOT_SUPPORT;
                }
            }

    For an analysis of CPUBackend::onCreate, see the linked article in this series.

1.1.1.1.2 Backend::onResizeBegin

    Pipeline::allocMemory calls VulkanBackend::onResizeBegin and CPUBackend::onResizeBegin as follows:

    mBackend->onResizeBegin();
    mBackupBackend->onResizeBegin();

    onResizeBegin is a virtual function. Since mBackend is a VulkanBackend (which inherits from Backend), the call actually dispatches to VulkanBackend::onResizeBegin:

// source/backend/vulkan/image/backend/VulkanBackend.cpp
void VulkanBackend::onResizeBegin() {
  // Begin recording mInitBuffer
    mInitBuffer->begin(0);
    if (!mDirect) {
        mCmdBuffer->begin(0);
    }
}

    mBackupBackend is a CPUBackend (which inherits from Backend); its implementation is covered in the linked article in this series.

1.1.1.1.2.1 VulkanCommandPool::Buffer::begin

    VulkanBackend::onResizeBegin calls VulkanCommandPool::Buffer::begin as follows:

mInitBuffer->begin(0);

  Its implementation (source/backend/vulkan/component/VulkanCommandPool.cpp):

void VulkanCommandPool::Buffer::begin(VkCommandBufferUsageFlags flag) const {
    VkCommandBufferBeginInfo cmdBufferBeginInfo{
        /* .sType            = */ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        /* .pNext            = */ nullptr,
        /* .flags            = */ flag,
        /* .pInheritanceInfo = */ nullptr,
    };
    vkResetCommandBuffer(mBuffer, 0);
    CALL_VK(vkBeginCommandBuffer(mBuffer, &cmdBufferBeginInfo));
}

1.1.1.1.3 _allocTensor

    Pipeline::allocMemory calls _allocTensor as follows:

                for (auto t : iter.workInputs) {
                    auto allocRes = _allocTensor(t, curBackend, mOutputStatic);
                    if (!allocRes) {
                        return OUT_OF_MEMORY;
                    }
                }

  Its implementation:

// source/core/Pipeline.cpp
static bool _allocTensor(Tensor* t, Backend* curBackend, bool outputStatic) {
    auto memoryType = _getTensorStorageType(t, outputStatic);
    auto bn         = TensorUtils::getDescribe(t)->getBackend();
    auto des = TensorUtils::getDescribe(t);
    if (nullptr == des->mem.get()) {
        MNN_ASSERT(des->memoryType != Tensor::InsideDescribe::MEMORY_VIRTUAL);
        TensorUtils::setLinearLayout(t);
        auto res     = curBackend->onAcquireBuffer(t, memoryType);
        return res;
    }
    return true;
}
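
The helper _getTensorStorageType is not shown in the article. A simplified, hypothetical sketch of the kind of policy it applies (the real function in Pipeline.cpp distinguishes more cases) might look like this:

// Hypothetical simplification of _getTensorStorageType; not the actual MNN source.
static Backend::StorageType _getTensorStorageTypeSketch(const Tensor* t, bool outputStatic) {
    auto usage = TensorUtils::getDescribe(t)->usage;
    // Session outputs can be kept in static memory so the user can read them after inference.
    if (outputStatic && usage == Tensor::InsideDescribe::OUTPUT) {
        return Backend::STATIC;
    }
    // Constants and trainable parameters live for the whole session.
    if (usage == Tensor::InsideDescribe::CONSTANT || usage == Tensor::InsideDescribe::TRAINABLE) {
        return Backend::STATIC;
    }
    // Everything else is dynamic and can be recycled between ops.
    return Backend::DYNAMIC;
}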

1.1.1.1.3.1 Backend::onAcquireBuffer

    _allocTensor calls Backend::onAcquireBuffer as follows:

  auto res     = curBackend->onAcquireBuffer(t, memoryType);

    onAcquireBuffer exists only in the Backend base class; its job is to allocate memory for a tensor:

bool Backend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) {
    auto mem = this->onAcquire(tensor, storageType);
    if (nullptr == mem) {
        return false;
    }
    if (mem == TensorUtils::getDescribe(tensor)->mem.get()) {
        return true;
    }
    TensorUtils::getDescribe(tensor)->mem.reset(mem);
    return true;
}

    onAcquireBuffer calls onAcquire, which is a virtual function. Since curBackend here is a VulkanBackend (which inherits from Backend), the call actually dispatches to VulkanBackend::onAcquire:

// source/backend/vulkan/image/backend/VulkanBackend.cpp
Backend::MemObj* VulkanBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
    //FUNC_PRINT_ALL(tensor, p);

    auto MTensor     = const_cast<Tensor*>(tensor);
    auto format = _getFormat(tensor->getType());
    if (Backend::STATIC == storageType) {
        auto newBuffer           = std::make_shared<VulkanTensor>(MTensor, format, getMemoryPool(), device().proty().limits);
        MTensor->buffer().device = (uint64_t)(newBuffer.get());
        return new VulkanMemRelease(newBuffer);
    }
    bool separate  = storageType == Backend::DYNAMIC_SEPERATE;
    auto newBuffer = std::make_shared<VulkanTensor>(MTensor, format, getDynamicMemoryPool(), device().proty().limits, separate);
    MTensor->buffer().device = (uint64_t)(newBuffer.get());
    mAllBuffers.insert(std::make_pair(MTensor->buffer().device, newBuffer));
    return new VulkanMemRelease(newBuffer);;
}

1.1.1.1.4 Execution::onResize

    Pipeline::allocMemory calls Execution::onResize as follows:

  // iter is a Command
  // iter.execution is a std::shared_ptr<Execution>
    auto code = iter.execution->onResize(iter.workInputs, iter.workOutputs);

    onResize is a virtual function; how iter.execution is created was covered under VulkanBackend::onCreate. The call iter.execution->onResize is polymorphic, with Execution as the base class. The operator executions created by the Vulkan backend are all subclasses of VulkanBasicExecution and are wrapped in either VulkanBasicExecutionDirect or VulkanBasicExecutionInDirect. We take VulkanBasicExecutionDirect as the example:

// source/backend/vulkan/image/execution/VulkanBasicExecution.cpp
ErrorCode VulkanBasicExecutionDirect::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
  // Member variables of VulkanBackend:
  // std::shared_ptr<VulkanCommandPool::Buffer> mCmdBuffer;
    // std::shared_ptr<VulkanCommandPool::Buffer> mInitBuffer;
    // mutable std::vector<VkCommandBuffer> mCmdBuffers;
    auto initCmdBuffer = static_cast<VulkanBackend*>(backend())->getInitCommandBuffer();
    _initLayout(inputs, outputs, initCmdBuffer);
    // Begin recording commands
    mCmdBuffer->begin(0);
    // Record the op's commands
    auto code = mEncoder->onEncode(inputs, outputs, mCmdBuffer.get());
    for (auto output : outputs) {
        auto vkTensor = reinterpret_cast<VulkanTensor*>(output->deviceId());
        for (int i=0; i<vkTensor->imageSize(); ++i) {
            auto img = vkTensor->image(i);
            img->barrierRead(mCmdBuffer->get());
        }
    }
    _postTreat(outputs, mCmdBuffer.get());
    // Finish recording commands
    mCmdBuffer->end();
#ifdef MNN_VULKAN_DEBUG
#ifdef MNN_VULKAN_DEBUG_EAGER
    static_cast<VulkanBackend*>(backend())->onExecuteBegin();
    static_cast<VulkanBackend*>(backend())->pushCommand(mCmdBuffer->get());
    static_cast<VulkanBackend*>(backend())->onExecuteEnd();
#endif
#endif
    return code;
}

    The commands recorded here perform the operator's computation; the wrapper itself is built on the Execution base class.
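
At execute time the pre-recorded command buffer is simply handed back to the backend. Below is a minimal sketch of VulkanBasicExecutionDirect::onExecute, inferred from the pushCommand call seen in the debug branch above (an approximation, not the verbatim source):

// Sketch: onExecute only queues the command buffer that was recorded during onResize.
ErrorCode VulkanBasicExecutionDirect::onExecute(const std::vector<Tensor *> &inputs,
                                                const std::vector<Tensor *> &outputs) {
    auto extra = static_cast<VulkanBackend *>(backend());
    extra->pushCommand(mCmdBuffer->get()); // collected into mCmdBuffers and submitted by _finish()
    return NO_ERROR;
}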

1.1.1.1.4.1 VulkanBasicExecution::onEncode

    VulkanBasicExecutionDirect::onResize calls VulkanBasicExecution::onEncode as follows:

    auto code = mEncoder->onEncode(inputs, outputs, mCmdBuffer.get());

    onEncode is a virtual function; mEncoder's base class is VulkanBasicExecution. We take VulkanConvolutionIm2Col as the example:

// source/backend/vulkan/image/execution/VulkanConvolutionImpl.cpp
class VulkanConvolutionIm2Col : public VulkanBasicExecution {
    virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                               const VulkanCommandPool::Buffer* cmdBuffer) override {
        auto src         = inputs[0];
        auto dst         = outputs[0];
        const int icDiv4 = UP_DIV(src->channel(), 4);
        const int ocDiv4 = UP_DIV(dst->channel(), 4);
        auto vkBn = (VulkanBackend*)backend();
        int limit = vkBn->proty().limits.maxImageDimension2D * 4;
#ifdef VULKAN_IM2COL_GEMM_UNIT
        limit = VULKAN_IM2COL_GEMM_UNIT;
#endif
        if (limit < dst->width()) {
            MNN_ERROR("Don't support width too large feature: %d x %d, limit = %d\n", dst->width(), dst->height(), limit);
            return NOT_SUPPORT;
        }
        int batchLoopNumber = 1;
        int heightLoopNumber = 1;
        int unitHeight = dst->height();
        int unitBatch = dst->batch();
        auto area = dst->width() * dst->height();
        if (limit < area) {
            batchLoopNumber = dst->batch();
            unitBatch = 1;
            unitHeight = limit / dst->width();
            heightLoopNumber = UP_DIV(dst->height(), unitHeight);
        } else if (limit < area * dst->batch()) {
            unitBatch = limit / area;
            batchLoopNumber = UP_DIV(dst->batch(), unitBatch);
        }
        int loopNumber = batchLoopNumber * heightLoopNumber;
        mConvParams.resize(loopNumber);
        mMultilers.resize(loopNumber);
        mIm2ColSet.resize(loopNumber);
        mCol2ImSet.resize(loopNumber);
        reinterpret_cast<VulkanTensor*>(src->deviceId())->image()->barrierRead(cmdBuffer->get());
        reinterpret_cast<VulkanTensor*>(dst->deviceId())->image()->barrierWrite(cmdBuffer->get());

        for (int i=0; i<batchLoopNumber; ++i) {
            int batchOffset = i * unitBatch;
            int currentBatch = dst->batch() - batchOffset;
            if (currentBatch > unitBatch) {
                currentBatch = unitBatch;
            }
            for (int j=0; j<heightLoopNumber; ++j) {
                int heightOffset = j * unitHeight;
                int currentHeight = dst->height() - heightOffset;
                if (currentHeight > unitHeight) {
                    currentHeight = unitHeight;
                }
                auto index = i * heightLoopNumber + j;
                auto totalNumberInput = currentBatch * icDiv4 * dst->width() * currentHeight;
                auto totalNumberOutput = currentBatch * ocDiv4 * dst->width() * currentHeight;
                mConvParams[index] = std::make_shared<VulkanBuffer>(vkBn->getMemoryPool(), false,
                                                        sizeof(VulkanConvolutionCommon::ConvolutionParameter), nullptr,
                                                        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
                {
                    auto convCons = reinterpret_cast<VulkanConvolutionCommon::ConvolutionParameter*>(mConvParams[index]->map());
                    VulkanConvolutionCommon::writeParameter(convCons, mConvCommonOption, src, dst);
                    convCons->offset[0] = batchOffset;
                    convCons->offset[1] = heightOffset;
                    convCons->outputSize[3] = currentBatch;
                    convCons->outputSize[1] = currentHeight;
                    mConvParams[index]->unmap();
                }
                mIm2ColSet[index].reset(mIm2Col->createSet());
                mCol2ImSet[index].reset(mCol2Im->createSet());
                mMultilers[index] = mMultiCreator();
                mMultilers[index]->prepare(static_cast<VulkanBackend*>(backend())->getInitCommandBuffer(), dst->width() * currentHeight * currentBatch);
                auto mMultiler = mMultilers[index].get();
                if (true) {
                    auto colImage = mMultiler->source();
                    // Barrier
                    mIm2ColSet[index]->writeImage(colImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_GENERAL, 0);
                    mIm2ColSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(src->deviceId()))->image()->view(), mSampler->get(),
                                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1);
                    mIm2ColSet[index]->writeBuffer(mConvParams[index]->buffer(), 2, mConvParams[index]->size());
                    mIm2Col->bind(cmdBuffer->get(), mIm2ColSet[index]->get());

                    colImage->barrierWrite(cmdBuffer->get());
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberInput, VulkanConvolutionCommon::gImage2ColLocal),
                                1, 1);
                }
                mMultilers[index]->compute(cmdBuffer);
                if (true) {
                    auto dstImage = mMultiler->dest();
                    mCol2ImSet[index]->writeImage(dstImage->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 0);
                    mCol2ImSet[index]->writeImage((reinterpret_cast<VulkanTensor*>(dst->deviceId()))->image()->view(), mSampler->get(),
                                        VK_IMAGE_LAYOUT_GENERAL, 1);

                    mCol2ImSet[index]->writeImage(mBias->view(), mSampler->get(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 2);
                    mCol2ImSet[index]->writeBuffer(mConvParams[index]->buffer(), 3, mConvParams[index]->size());
                    mCol2Im->bind(cmdBuffer->get(), mCol2ImSet[index]->get());
                    
                    dstImage->barrierRead(cmdBuffer->get());
                    mBias->barrierRead(cmdBuffer->get());
                    vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalNumberOutput, VulkanConvolutionCommon::gImage2ColLocal),
                                1, 1);
                }
            }
        }
        return NO_ERROR;
    }
};

1.1.1.1.4.2 VulkanCommandPool and Buffer

// source/backend/vulkan/component/VulkanCommandPool.hpp
class VulkanCommandPool : public NonCopyable {
public:
    VulkanCommandPool(const VulkanDevice& dev);
    virtual ~VulkanCommandPool();

    class Buffer : public NonCopyable {
    public:
        Buffer(const VulkanCommandPool* pool);
        virtual ~Buffer();

        VkCommandBuffer get() const {
            return mBuffer;
        }

        void begin(VkCommandBufferUsageFlags flags) const;
        void end() const;
        enum BarrierType {
            READ_WRITE = 0,
            WRITE_WRITE,
            WRITE_READ,
        };
        void barrierSource(VkBuffer source, size_t start, size_t end, BarrierType type = READ_WRITE) const;
        void barrierSource(std::tuple<VkBuffer, VkDeviceSize, VkDeviceSize>, BarrierType type = READ_WRITE) const;
    private:
        VkCommandBuffer mBuffer;
        const VulkanCommandPool* mPool;
    };

    VulkanCommandPool::Buffer* allocBuffer() const;

    VkCommandPool pool() const {
        return mPool;
    }

    void submitAndWait(VkCommandBuffer buffer) const;

private:
    const VulkanDevice& mDevice;
    VkCommandPool mPool;
    mutable std::vector<VkCommandBuffer> mFreeBuffers;
};

// source/backend/vulkan/component/VulkanCommandPool.cpp
void VulkanCommandPool::Buffer::begin(VkCommandBufferUsageFlags flag) const {
    VkCommandBufferBeginInfo cmdBufferBeginInfo{
        /* .sType            = */ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        /* .pNext            = */ nullptr,
        /* .flags            = */ flag,
        /* .pInheritanceInfo = */ nullptr,
    };
    vkResetCommandBuffer(mBuffer, 0);
    CALL_VK(vkBeginCommandBuffer(mBuffer, &cmdBufferBeginInfo));
}
void VulkanCommandPool::Buffer::end() const {
    CALL_VK(vkEndCommandBuffer(mBuffer));
}

1.1.1.1.5 Backend::onResizeEnd

    Pipeline::allocMemory calls VulkanBackend::onResizeEnd and CPUBackend::onResizeEnd as follows:

    auto code = mBackend->onResizeEnd();
    if (code != NO_ERROR) {
        return code;
    }
    code = mBackupBackend->onResizeEnd();

    onResizeEnd is a virtual function. Since mBackend is a VulkanBackend (which inherits from Backend), the call actually dispatches to VulkanBackend::onResizeEnd:

// source/backend/vulkan/image/backend/VulkanBackend.cpp
ErrorCode VulkanBackend::onResizeEnd() {
    if (!mDirect) {
        mCmdBuffer->end();
    }
    // Finish recording mInitBuffer
    mInitBuffer->end();
    mCmdBuffers.emplace_back(mInitBuffer->get());
    // Submit the just-recorded mInitBuffer, i.e. the buffers collected in mCmdBuffers
    _finish();
    return NO_ERROR;
}

mBackupBackend is a CPUBackend (which inherits from Backend); its implementation is covered in the linked article in this series.

1.1.1.1.5.1 _finish

// source/backend/vulkan/image/backend/VulkanBackend.cpp
void VulkanBackend::_finish() const {
    if (mCmdBuffers.empty()) {
        return;
    }
    VkSubmitInfo submit_info = {/* .sType                = */ VK_STRUCTURE_TYPE_SUBMIT_INFO,
                                /* .pNext                = */ nullptr,
                                /* .waitSemaphoreCount   = */ 0,
                                /* .pWaitSemaphores      = */ nullptr,
                                /* .pWaitDstStageMask    = */ nullptr,
                                /* .commandBufferCount   = */ (uint32_t)mCmdBuffers.size(),
                                /* .pCommandBuffers      = */ mCmdBuffers.data(),
                                /* .signalSemaphoreCount = */ 0,
                                /* .pSignalSemaphores    = */ nullptr};
    auto fenceReal           = mFence->get();
    mFence->reset();
    CALL_VK(vkQueueSubmit(device().acquireDefaultDevQueue(), 1, &submit_info, fenceReal));

    auto res = mFence->wait();
    MNN_VK_CHECK(res);
    mCmdBuffers.clear();
}

How each operator's recorded command buffers are executed is covered in the linked article in this series.

