MNN Session 之维度计算（五）-阿里云开发者社区

1、createSession

依据 ScheduleConfig 和 RuntimeInfo 创建会话。

// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
    return createMultiPathSession({config}, runtime);
}

1.1 createMultiPathSession

createMultiPathSession 完整代码

// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
  // ...
    auto result = newSession.get();
    auto validForResize = info.validForResize;
    if (validForResize && mNet->modes.inputMode == Session_Input_Inside && mNet->modes.resizeMode == Session_Resize_Direct) {
        result->resize();
    }
    // ...
    return result;
}

1.1.1 Session::resize

// source/core/Session.cpp
ErrorCode Session::resize() {
#ifdef LOG_VERBOSE
    for (auto& iter : mInputs) {
        auto& inputTensor = iter.second;
        MNN_PRINT("before resize, input name:%s, ptr:%p, hostPtr:%p,  shape:", iter.first.c_str(), inputTensor, inputTensor->host<void>());
        inputTensor->printShape();
        MNN_PRINT("\n");
    }
#endif
    bool permitCodegen = mCodegenMode == Interpreter::Session_Codegen_Enable;

    bool firstMalloc = false;
    if (mNeedResize) {
        bool debug = mCallBackMode == Interpreter::Session_Debug;
        // mPipelines 类型为 std::vector<std::shared_ptr<Pipeline>>
        for (auto& iter : mPipelines) {
            auto error = iter->encode(debug, permitCodegen);
            if (NO_ERROR != error) {
                return error;
            }
        }
        mNeedResize = false;
        mNeedMalloc = true;
        firstMalloc = true;
    }
    if (mNeedMalloc) {
        // Set needResize = true for easy for judge in runSession when error
        mNeedResize = true;
        // Turn Pipeline to Command Buffer and Malloc resource
        // TODO: Separate Schedule and Malloc
        bool forbidReplace = permitCodegen;
        if (mInfo.constReplaceBackend != nullptr) {
            forbidReplace = true;
        }
        for (auto& iter : mPipelines) {
            auto error = iter->allocMemory(firstMalloc, forbidReplace);
            if (NO_ERROR != error) {
                return error;
            }
        }
        if(mMemoryUsageMode == Interpreter::Session_Memory_Collect) {
            #ifdef LOG_VERBOSE
            float memory = 0.0f;
            #endif
            for (auto& iter : mRuntime.first) {
                iter.second->onGabageCollect(0);
                #ifdef LOG_VERBOSE
                memory += iter.second->onGetMemoryInMB();
                #endif
            }
            #ifdef LOG_VERBOSE
            FUNC_PRINT_ALL(memory, f);
            #endif
        }
        mNeedMalloc = false;
        mNeedResize = false;
    }

#ifdef LOG_VERBOSE
    MNN_PRINT("session after resize\n");
    for (auto& iter : mOutputs) {
        auto& outputTensor = iter.second;
        MNN_PRINT("output name:%s, ptr:%p,shape:", iter.first.c_str(), outputTensor);
        outputTensor->printShape();
        MNN_PRINT("\n");
    }
#endif
    return NO_ERROR;
}

1.1.1.1 Pipeline::encode

BackendCache、OpCacheInfo

// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
//   struct BackendCache {
//      Backend::Info info;
//      BackendConfig config;
//      std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
//      bool needComputeShape = true;
//      bool needComputeGeometry = true;
//      bool reportError = true;
//      std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
//  };
//
//    /** pipeline info */
//    struct OpCacheInfo {
//        /** op */
//        const Op* op;
//        /** input tensors */
//        std::vector<Tensor*> inputs;
//        /** output tensors */
//        std::vector<Tensor*> outputs;
//        /** schedule type*/
//        Schedule::Type type = Schedule::Type::SEPARATE;
//
//        /**Command buffer for cache*/
//        CommandBuffer cacheBuffer;
//
//        /**Command buffer for execute*/
//        CommandBuffer executeBuffer;
//        
//        std::map<const Op*, std::shared_ptr<Execution>> executionCache;
//    };
//
ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) {
  // mInfo.first.cache 类型为 std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>>
  // mBackend 创建的后端如（VulkanBackend）
    auto& mBackend = mInfo.first.cache.first;
    // mBackupBackend 创建的后备（默认）后端如（CPUBackend）
    auto& mBackupBackend = mInfo.first.cache.second;
    // Static Model just copy info to command buffer
    // mInfo.first 类型为 BackendCache 
    if (!mInfo.first.needComputeGeometry) {
        for (int i=0; i<mInfo.second.size(); ++i) {
            auto& info = mInfo.second[i];
            SharedPtr<Command> cmd = new Command;
            cmd->op      = info.op;
            if (cmd->op->type() == OpType_Raster) {
                // Compability for Origin Static Model
                cmd->outputs  = info.outputs;
                if (TensorUtils::getDescribe(info.outputs[0])->regions.empty() && info.inputs.size() > 0 && TensorUtils::getDescribe(info.inputs[0])->regions.size() > 0) {
                    TensorUtils::getDescribe(info.outputs[0])->regions = std::move(TensorUtils::getDescribe(info.inputs[0])->regions);
                    TensorUtils::setRasterInputs(cmd.get());
                } else {
                    cmd->inputs  = info.inputs;
                }
            } else {
                cmd->inputs  = info.inputs;
                cmd->outputs = info.outputs;
            }
            info.executeBuffer.command = {cmd};
        }
    } else {
#ifndef MNN_BUILD_MINI
    // mContext 类型为 GeometryComputer::Context
        mContext.clear();
        /** Size Compute and compute Const Begin */
        auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen);
        if (res != NO_ERROR) {
            return res;
        }
#endif
    }
    // Propagate Scale and insert new command
    if (mIsQuantModel && (mBackend->type() == MNN_FORWARD_CPU || mBackend->type() == MNN_FORWARD_CPU_EXTENSION || mBackend->type() == MNN_FORWARD_CUDA || mBackend->type() == MNN_FORWARD_NN || mBackend->type() == MNN_FORWARD_OPENCL)) {
        // get propagate map
        using PropagateMap = std::map<const MNN::Tensor*, std::set<const MNN::Tensor*>>;
        PropagateMap forwardMap, backwardMap;
        auto insertPropagateMap = [](PropagateMap& propagateMap, const Tensor* s, const Tensor* t) {
            if (propagateMap.find(s) == propagateMap.end()) {
                propagateMap[s] = std::set<const Tensor*>({t});
            } else {
                propagateMap[s].insert(t);
            }
        };
        std::set<OpType> propagateOpTypes = { OpType_Raster, OpType_ReLU, OpType_ReLU6, OpType_Pooling,
                                              OpType_Interp, OpType_CropAndResize, OpType_ROIPooling, OpType_Gather,
                                              OpType_GatherV2, OpType_GatherV2, OpType_ScatterNd};
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (const auto& cmdP : buffer.command) {
                auto& cmd = *cmdP;
                const auto type = cmd.op->type();
                const auto output = cmd.outputs[0];
                if (propagateOpTypes.find(type) != propagateOpTypes.end()) {
                    for (auto t : cmd.inputs) {
                        insertPropagateMap(forwardMap, t, output);
                        insertPropagateMap(backwardMap, output, t);
                    }
                }
            }
        }
        auto getStart = [&forwardMap, &backwardMap](bool forward) {
            auto& propagateMap = forward ? forwardMap : backwardMap;
            auto& antiMap = forward ? backwardMap : forwardMap;
            // delete N->1 Map of Op
            for (const auto& iter : antiMap) {
                if (iter.second.size() > 1) {
                    for (auto t : iter.second) {
                        auto res = propagateMap.find(t);
                        if (res != propagateMap.end()) {
                            propagateMap.erase(res);
                        }
                    }
                }
            }
            std::set<const Tensor*> root, leaf, start;
            for (const auto& iter : propagateMap) {
                root.insert(iter.first);
                for (auto t : iter.second) {
                    leaf.insert(t);
                }
            }
            std::set_difference(root.begin(), root.end(), leaf.begin(), leaf.end(), std::inserter(start, start.begin()));
            return start;
        };
        auto forwardStart = getStart(true);
        auto backwardStart = getStart(false);
        // propagate scale
        auto propagateScale = [](PropagateMap& propagateMap, std::set<const Tensor*>& start) {
            std::function<bool(const Tensor*)> scalePropagate = [&propagateMap, &scalePropagate](const Tensor* t) {
                if (TensorUtils::getDescribe(t)->quantAttr.get() == nullptr) {
                    return false;
                }
                if (propagateMap.find(t) == propagateMap.end()) {
                    return false;
                }
                bool change = false;
                for (auto x : propagateMap[t]) {
                    if (TensorUtils::getDescribe(x)->quantAttr != TensorUtils::getDescribe(t)->quantAttr) {
                        TensorUtils::getDescribe(x)->quantAttr = TensorUtils::getDescribe(t)->quantAttr;
                        change = true;
                    }
                    change |= scalePropagate(x);
                }
                return change;
            };
            bool change = false;
            for (auto t : start) {
                change |= scalePropagate(t);
            }
            return change;
        };
        for (int i = 0; i < 3 && (propagateScale(forwardMap, forwardStart) || propagateScale(backwardMap, backwardStart)); i++);
        
        // Insert cast
        std::map<const Tensor*, Tensor*> cachedCastTensor;
        for (auto& info : mInfo.second) {
            auto bufferCommand = std::move(info.executeBuffer.command);
            bool hasConvert = false;
            for (auto cmdP : bufferCommand) {
                auto& cmd = *cmdP;
                auto& outputs = cmd.outputs;
                auto& inputs = cmd.inputs;
                auto opType = cmd.op->type();
                // Check if need use quant op
                DataType runType = DataType_DT_FLOAT;
                bool useQuant = false;
                if (outputs.size() == 1) {
                    // Quant: output and all input has quantAttr and op support
                    if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr) {
                        useQuant = _supportQuant(cmd.op, inputs, outputs, mBackend->type());
                    }
                    if (useQuant) {
                        for (auto t : inputs) {
                            if (TensorUtils::getDescribe(t)->quantAttr == nullptr) {
                                useQuant = false;
                                break;
                            }
                        }
                    }
                }
                if (useQuant) {
                    runType = DataType_DT_INT8;
                }
                
                for (auto o : outputs) {
                    auto quan = TensorUtils::getDescribe(o)->quantAttr;
                    if (nullptr != quan) {
                        TensorUtils::getDescribe(o)->type = runType;
                    }
                }
                auto makeCommand = [&cachedCastTensor, &info](CommandBuffer& cmdBuffer, Tensor* input, DataType runType) {
                    if (cachedCastTensor.find(input) != cachedCastTensor.end()) {
                        return cachedCastTensor[input];
                    }
                    std::shared_ptr<Tensor> wrapTensor(new Tensor);
                    TensorUtils::copyShape(input, wrapTensor.get(), true);
                    TensorUtils::setLinearLayout(wrapTensor.get());
                    auto des = TensorUtils::getDescribe(wrapTensor.get());
                    auto originDes = TensorUtils::getDescribe(input);
                    if (originDes->quantAttr != nullptr) {
                        des->quantAttr.reset(new QuantAttr);
                        *des->quantAttr = *originDes->quantAttr;
                        des->type = runType;
                    }
                    cmdBuffer.extras.emplace_back(wrapTensor);
                    SharedPtr<Command> command(new Command);
                    command->inputs = {input};
                    command->outputs = {wrapTensor.get()};
                    info.cacheBuffer.hasWrap = true;
                    flatbuffers::FlatBufferBuilder builder;
                    OpBuilder opB(builder);
                    if (runType == DataType_DT_INT8) {
                        opB.add_type(OpType_FloatToInt8);
                    } else {
                        opB.add_type(OpType_Int8ToFloat);
                    }
                    builder.Finish(opB.Finish());
                    command->buffer.reset(new BufferStorage);
                    command->buffer->storage = builder.ReleaseRaw(command->buffer->allocated_size, command->buffer->offset);
                    command->op = flatbuffers::GetRoot<Op>(command->buffer->buffer());
                    info.executeBuffer.command.emplace_back(std::move(command));
                    return wrapTensor.get();
                };
                // judge is it need CastWrap
                if (OpType_Raster == opType) {
                    for (int v=0; v<cmd.inputs.size(); ++v) {
                        auto input = cmd.inputs[v];
                        bool needCast = CPUBackend::getDataType(input) != runType;
                        if (needCast) {
                            cmd.inputs[v] = makeCommand(info.executeBuffer, input, runType);
                        }
                    }
                } else {
                    for (int i = 0; i < cmd.inputs.size(); i++) {
                        if (OpCommonUtils::opNeedContent(cmd.op, i) && inputs[i]->getType() != halide_type_of<int>()) {
                            bool needCast = CPUBackend::getDataType(inputs[i]) != runType;
                            if (needCast) {
                                cmd.inputs[i] = makeCommand(info.executeBuffer, inputs[i], runType);
                            }
                        }
                    }
                }
                info.executeBuffer.command.emplace_back(cmdP);
            }
        }
    }
    /** Prepare DebugInfo*/
    if (supportDebug) {
        mFlops = 0.0f;
        int totalIndex = 0;
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            int index = 0;
            for (auto& cmdP : buffer.command) {
                auto& cmd = *cmdP;
                cmd.info.reset(new UnitInfo);
                static_cast<UnitInfo*>(cmd.info.get())->setUp(cmd, index++, info.op, totalIndex++);
                mFlops += cmd.info->flops();
            }
        }
    }
#ifndef MNN_BUILD_MINI
    else {
        for (auto& info : mInfo.second) {
            auto& buffer = info.executeBuffer;
            for (auto& cmdP : buffer.command) {
                mFlops += SizeComputer::computeFlops(cmdP->op, cmdP->inputs, cmdP->outputs);
            }
        }
    }
#endif

    return NO_ERROR;
}

1.1.1.1.1 GeometryComputerUtils::shapeComputeAndGeometryTransform

OpCacheInfo

// source/geometry/GeometryComputerUtils.cpp
//    /** pipeline info */
//    struct OpCacheInfo {
//        /** op */
//        const Op* op;
//        /** input tensors */
//        std::vector<Tensor*> inputs;
//        /** output tensors */
//        std::vector<Tensor*> outputs;
//        /** schedule type*/
//        Schedule::Type type = Schedule::Type::SEPARATE;
//
//        /**Command buffer for cache*/
//        CommandBuffer cacheBuffer;
//
//        /**Command buffer for execute*/
//        CommandBuffer executeBuffer;
//        
//        std::map<const Op*, std::shared_ptr<Execution>> executionCache;
//    };
//
ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
    std::vector<Schedule::OpCacheInfo>& infos,
    GeometryComputer::Context& geoContext,
    std::shared_ptr<Backend> backupBackend,
    Runtime::CompilerType compileType, 
    bool skipShapeCompute,
    bool permitCodegen) {
    /** Size Compute and compute Const Begin */
    GeometryComputer::Context ctx(backupBackend);
    // Size Compute and compute Const
    // infos 为算子缓存，大小为 171
    for (int i=0; i<infos.size(); ++i) {
      // info 类型为 OpCacheInfo
        auto& info = infos[i];
        auto& cmdBufferVir = info.executeBuffer;
        auto& tempBuffer = info.cacheBuffer;
        // TODO: Optimize
        cmdBufferVir.command.clear();
        cmdBufferVir.extras.clear();
        for (auto t : info.outputs) {
            if (!TensorUtils::getDescribe(t)->isMutable) {
                continue;
            }
            auto usage = TensorUtils::getDescribe(t)->usage;
            auto type = TensorUtils::getDescribe(t)->memoryType;
            MNN_ASSERT(type != Tensor::InsideDescribe::MEMORY_OUTSIDE);
            MNN_ASSERT(type != Tensor::InsideDescribe::MEMORY_HOST);
            if (TensorUtils::getDescribeOrigin(t)->mContent->count() > 1) {
                TensorUtils::getDescribeOrigin(t)->mContent = new Tensor::InsideDescribe::NativeInsideDescribe;
                t->buffer().dim = TensorUtils::getDescribe(t)->dims;
                TensorUtils::getDescribe(t)->usage = usage;
            } else {
              // 不是不变的和可训练的
                if (info.type != Schedule::CONSTANT && usage != Tensor::InsideDescribe::TRAINABLE) {
                    TensorUtils::getDescribeOrigin(t)->mContent->setBackend(nullptr);
                    // TODO: If output is static and length larger than new size, don't clear mem
                    TensorUtils::getDescribeOrigin(t)->mContent->mem.reset(nullptr);
                }
            }
        }
        if (!skipShapeCompute) {
            auto res = SizeComputer::computeOutputSize(info.op, info.inputs, info.outputs);
            if (!res) {
                if (info.op->name() != nullptr) {
                    MNN_ERROR("Compute Shape Error for %s\n", info.op->name()->c_str());
                } else {
                    MNN_ERROR("Compute Shape Error for %d\n", info.op->type());
                }
                return COMPUTE_SIZE_ERROR;
            }
            // FIXME: Find better way to may compability for old model
            /**
             For Convolution of 2D / 3D Tensor(Dense / 1D Convolution)
             Because of old code, we will acces dim[2] / dim[3] to get width and height
             Set the lenght to 1 for compability
             */
            for (auto t : info.outputs) {
                TensorUtils::adjustTensorForCompability(t);
            }
        }

        if (info.type == Schedule::CONSTANT) {
            if (_hasZeroShapeOutput(info)) {
                continue;
            }
            ctx.clear();
            auto geo = GeometryComputer::search(info.op->type(), Runtime::Compiler_Loop);
            {
                auto res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer);
                if (!res) {
                    tempBuffer.command.clear();
                    tempBuffer.extras.clear();
                    res = geo->onCompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer);
                }
                if (!res) {
                    MNN_ERROR("Const Folder Error in geometry for %s\n", info.op->name()->c_str());
                    return NOT_SUPPORT;
                }
            }
            GeometryComputerUtils::makeRaster(tempBuffer, cmdBufferVir, ctx);
            for (auto t : info.outputs) {
                ctx.getRasterCacheCreateRecursive(t, cmdBufferVir);
            }
            for (auto& cp : cmdBufferVir.command) {
                auto& c = *cp;
                if (nullptr == c.execution) {
                    c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
                }
                auto exe = c.execution;
                if (nullptr == exe.get()) {
                    MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str());
                    return NO_EXECUTION;
                }
                for (auto t : c.outputs) {
                    auto des = TensorUtils::getDescribe(t);
                    TensorUtils::setLinearLayout(t);
                    auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
                    if (!res) {
                        return OUT_OF_MEMORY;
                    }
                    des->setBackend(backupBackend.get());
                }
                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = backupBackend->onResizeEnd();
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = exe->onExecute(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }

            }
            // Clear const command
            ctx.pushCache(cmdBufferVir);
            cmdBufferVir.command.clear();
            cmdBufferVir.extras.clear();
        }


    }
    /** Size Compute and compute Const End */

    /** Geometry Transform */
    for (int i=0; i<infos.size(); ++i) {
        auto& info = infos[i];
        auto& cmdBufferReal = info.executeBuffer;
        auto& tempBuffer = info.cacheBuffer;
        // TODO: Optimize
        if (info.type == Schedule::CONSTANT) {
            continue;
        }
        if (_hasZeroShapeOutput(info)) {
            continue;
        }
        auto geo = GeometryComputer::search(info.op->type(), compileType);
        {
            bool res = false;
            if (!tempBuffer.hasWrap) {
                res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer);
            }
            if (!res) {
                tempBuffer.command.clear();
                tempBuffer.extras.clear();
                res = geo->onCompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer);
            }
            if (!res) {
                return NOT_SUPPORT;
            }
            tempBuffer.hasWrap = false;
            GeometryComputerUtils::makeRaster(tempBuffer, cmdBufferReal, geoContext);
            for (auto t : info.outputs) {
                auto des = TensorUtils::getDescribe(t);
                if (des->usage == Tensor::InsideDescribe::OUTPUT || des->usage == Tensor::InsideDescribe::TRAINABLE) {
                    // For output and trainable value, must directly compute the tensor
                    geoContext.getRasterCacheCreateRecursive(t, cmdBufferReal);
                }
            }
        }
    }

    
#ifdef MNN_BUILD_CODEGEN
    if(permitCodegen) {
        #ifdef LOG_VERPOSE
        MNN_PRINT("infos : [\n");
        for (auto info : infos) {
            auto& cmds = info.executeBuffer.command;
            for (auto cmd : cmds) {
                MNN_PRINT("\t%s", EnumNameOpType(cmd->op->type()));
                if(cmd->op->type() == OpType_BinaryOp) {
                    MNN_PRINT(" %d ", cmd->op->main_as_BinaryOp()->opType());
                }
                if(cmd->op->type() == OpType_UnaryOp) {
                    MNN_PRINT(" %d ", cmd->op->main_as_UnaryOp()->opType());
                }
                MNN_PRINT("\n");
            }
        }
        MNN_PRINT("]\n");
        MNN_PRINT("==================== opFuse ====================\n");
        #endif

        opFuse(infos, geoContext.forwardType(), geoContext.precisionType());

        #ifdef LOG_VERPOSE
        MNN_PRINT("infos : [\n");
        for (auto info : infos) {
            auto& cmds = info.executeBuffer.command;
            for (auto cmd : cmds) {
                MNN_PRINT("\t%s\n", EnumNameOpType(cmd->op->type()));
            }
        }
        MNN_PRINT("]\n");
        #endif
    }
#endif
    return NO_ERROR;
}

1.1.1.1.1.1 SizeComputer::computeOutputSize

维度计算。

// source/shape/SizeComputer.cpp
bool SizeComputer::computeOutputSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                     const std::vector<Tensor*>& outputs) {
    auto computeFactory = SizeComputerSuite::get();
    // When op is nullptr, it means a copy op
    if (nullptr != op) {
        // For Loop Op
        if (op->type() == OpType_While && op->main_type() == OpParameter_LoopParam) {
            auto loop = op->main_as_LoopParam();
            if (loop->extraTensorInfos() == nullptr) {
                return false;
            }
            MNN_ASSERT(loop->extraTensorInfos()->size() == outputs.size());
            for (int i=0; i<outputs.size(); ++i) {
                auto des = loop->extraTensorInfos()->GetAs<TensorDescribe>(i);
                MNN_ASSERT(des->blob() != nullptr);
                auto blob = des->blob();
                TensorUtils::getDescribe(outputs[i])->dimensionFormat = blob->dataFormat();
                outputs[i]->setType(blob->dataType());
                if (blob->dims() != nullptr) {
                    auto dims = blob->dims()->data();
                    outputs[i]->buffer().dimensions = blob->dims()->size();
                    for (int j=0; j<blob->dims()->size(); ++j) {
                        outputs[i]->setLength(j, dims[j]);
                    }
                } else {
                    outputs[i]->buffer().dimensions = 0;
                }
            }
            return true;
        }

        // Don't support compute shape for control flow op
        if (op->type() == OpType_While || op->type() == OpType_If) {
            return false;
        }
        // Check -1 input
        for (auto& t : inputs) {
            for (int i=0; i < t->dimensions(); ++i) {
                if (t->length(i) < 0) {
                    return false;
                }
            }
        }
        auto computer = computeFactory->search(op->type());
        if (nullptr != computer) {
            bool ret = computer->onComputeSize(op, inputs, outputs);
#ifdef MNN_DEBUG_TENSOR_SIZE
            _printShape(op, inputs, outputs);
#endif
            return ret;
        }
    }

    // Default Set to the same
    if (inputs.size() >= 1 && (outputs.size() == 1 || outputs.size() == inputs.size())) {
        if (inputs[0] == outputs[0]) {
            return true;
        }
        for (int i=0; i<outputs.size(); ++i) {
            const auto& ib = inputs[i]->buffer();
            auto& ob       = outputs[i]->buffer();
            memcpy(ob.dim, ib.dim, sizeof(halide_dimension_t) * ib.dimensions);
            ob.dimensions                                         = ib.dimensions;
            ob.type                                               = ib.type;
            TensorUtils::getDescribe(outputs[i])->dimensionFormat = TensorUtils::getDescribe(inputs[i])->dimensionFormat;
        }
#ifdef MNN_DEBUG_TENSOR_SIZE
        _printShape(op, inputs, outputs);
#endif
        return true;
    }
    // Not Support
    MNN_PRINT("Can't compute size for %d, name=%s\n", op->type(), op->name() ? op->name()->c_str() : "");

    return false;
}

1.1.1.1.1.1.1 SizeComputerSuite::search

// source/shape/SizeComputer.cpp
SizeComputer* SizeComputerSuite::search(OpType name) {
    auto iter = mRegistry[name];
    if (iter == nullptr) {
        return nullptr;
    }
    return iter;
}

1.1.1.1.1.1.1.1 维度计算初始化与注册

维度计算初始化与注册在 registerBackend 函数中调用 SizeComputerSuite::init() 来实现的。

static std::once_flag s_flag;
void registerBackend() {
    std::call_once(s_flag, [&]() {
    // ...
        SizeComputerSuite::init();
        GeometryComputer::init();
    // ...        
    });
}

SizeComputerSuite::init() 实现如下：

// source/shape/SizeComputer.cpp
void SizeComputerSuite::init() {
    if (nullptr != gInstance) {
        return;
    }
    gInstance = new SizeComputerSuite;
    gInstance->mRegistry.resize(OpType_MAX + 1);
    ::memset(gInstance->mRegistry.data(), 0, gInstance->mRegistry.size() * sizeof(SizeComputer*));
    registerShapeOps();
}

1.1.1.1.1.1.1.1.1 注册维度计算

registerShapeOps 用来注册维度计算，它通过调用一个个维度计算函数来实现注册。

// source/shape/ShapeRegister.cpp
void registerShapeOps() {
___ShapeSizeComputer__OpType_Shape__();
___ShapeRasterComputer__OpType_Raster__();
  // ...
}

如 ___ShapeSizeComputer__OpType_Shape__ 这样的函数是通过 REGISTER_SHAPE 宏定义的：

// source/shape/SizeComputer.hpp
#define REGISTER_SHAPE(name, op)                          \
    void ___##name##__##op##__() {                        \
        name* _temp = new name;                            \
        SizeComputerSuite* ts = SizeComputerSuite::get(); \
        ts->insert(_temp, op);                           \
    }

其实现代码如下：

// source/shape/ShapeShape.cpp
class ShapeSizeComputer : public SizeComputer {
    virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                               const std::vector<Tensor*>& outputs) const override {
        MNN_ASSERT(1 <= inputs.size());
        MNN_ASSERT(1 == outputs.size());
        auto& ib = inputs[0]->buffer();
        auto& ob = outputs[0]->buffer();

        ob.dimensions = 1;
        outputs[0]->setType(DataType_DT_INT32);
        TensorUtils::getDescribe(outputs[0])->dimensionFormat = op->defaultDimentionFormat();
        auto inputFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
        if (inputFormat == MNN_DATA_FORMAT_NC4HW4 && op->defaultDimentionFormat() == MNN_DATA_FORMAT_NHWC) {
            // For compability
            ob.dim[0].extent = 4;
        } else {
            ob.dim[0].extent = ib.dimensions;
        }
        return true;
    }
};

REGISTER_SHAPE(ShapeSizeComputer, OpType_Shape);

REGISTER_SHAPE(ShapeSizeComputer, OpType_Shape) 宏扩展如下：

// REGISTER_SHAPE(ShapeSizeComputer, OpType_Shape) 
    void ___ShapeSizeComputer__OpType_Shape__() {
        ShapeSizeComputer* _temp = new ShapeSizeComputer;  
        SizeComputerSuite* ts = SizeComputerSuite::get();
        ts->insert(_temp, op);                       
    }

ts->insert 的实现代码如下：

// source/shape/SizeComputer.cpp
void SizeComputerSuite::insert(SizeComputer* t, OpType type) {
    mRegistry[type] = t;
}

由上可见，扩展后的代码正是一个函数，其把维度计算类（ShapeSizeComputer）注册到 mRegistry 中，函数名 ___ShapeSizeComputer__OpType_Shape__ 呼应了 registerShapeOps 函数中的调用。mRegistry 呼应了 SizeComputerSuite::search 函数的实现。

1.1.1.1.1.1.2 SizeComputer::onComputeSize

SizeComputer::computeOutputSize 函数中的 onComputeSize 如下：

    auto computer = computeFactory->search(op->type());
        if (nullptr != computer) {
            bool ret = computer->onComputeSize(op, inputs, outputs);

search 函数找到对应的维度计算实现（如：ConvolutionSizeComputer），然后调用其方法 onComputeSize。

备注：维度计算的所有实现都是基于 SizeComputer 基类的，实际运行中根据 op->type() 类型，调用不同的维度计算子类

如下为 ConvolutionSizeComputer::onComputeSize 的实现：

// source/shape/ShapeConvolution.cpp
virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                               const std::vector<Tensor*>& outputs) const override {
        MNN_ASSERT(inputs.size() >= 1);
        MNN_ASSERT(1 == outputs.size());
        const Convolution2DCommon* layer = loadCommon(op);
        int kX = layer->kernelX();
        int kY = layer->kernelY();
        auto outputCount = layer->outputCount();
        if (inputs.size() > 1 && outputCount == 0) {
            // From TF's multi input convolution
            outputCount = inputs[1]->length(0);
            kX = inputs[1]->length(3);
            kY = inputs[1]->length(2);
        }
        int kernel_width  = layer->dilateX() * (kX - 1) + 1;
        int kernel_height = layer->dilateY() * (kY - 1) + 1;

        int output_width  = 1;
        int output_height = 1;

        auto input = inputs[0];
        if (input->dimensions() <= 1) {
            // Convolution is not valid for dimension <= 1
            return false;
        }

        auto inputCount = layer->inputCount();
        bool depthwiseMatch =
            inputCount == layer->outputCount() &&
            inputCount == layer->group() &&
            inputCount == input->channel();
        int commonChannelMatch =
            inputCount == inputs[0]->channel() ||            // real relationship in express
            (inputCount * layer->group() == input->channel()); // standard definition of group convolution
        bool valid = inputCount == 0 || depthwiseMatch || commonChannelMatch;

        // For Tensorflow Group Convolution, the inputCount is the size of filter's input count
        if (inputs.size() == 1 && !valid && OpType_Convolution == op->type()) {
            input->printShape();
            MNN_ERROR(
                "Error for compute convolution shape, inputCount:%d, outputCount:%d, KH:%d, KW:%d, group:%d\ninputChannel: %d, batch:%d, width:%d, height:%d. "
                "Input data channel may be mismatch with filter channel count\n",
                layer->inputCount(), outputCount, kY, kX, layer->group(),
                input->channel(), input->batch(), input->width(), input->height());
            return false;
        }

        if (layer->padMode() == PadMode_SAME) {
            // Tensorflow padding mode SAME
            output_width  = ceil((float)input->width() / (float)layer->strideX());
            output_height = ceil((float)input->height() / (float)layer->strideY());
        } else if (layer->padMode() == PadMode_VALID) {
            // Tensorflow padding mode VALID
            output_width  = ceil((float)(input->width() - kernel_width + 1) / (float)layer->strideX());
            output_height = ceil((float)(input->height() - kernel_height + 1) / (float)layer->strideY());
        } else {
            // Pad_Caffe means User setted padding
            if (nullptr != layer->pads()) {
                MNN_ASSERT(layer->pads()->size() >= 4);
                int input_width  = input->width() + layer->pads()->data()[1] + layer->pads()->data()[3];
                int input_height = input->height() + layer->pads()->data()[0] + layer->pads()->data()[2];
                output_width     = input_width < kernel_width ? 0 : (input_width - kernel_width) / layer->strideX() + 1;
                output_height    = input_height < kernel_height ? 0 : (input_height - kernel_height) / layer->strideY() + 1;
            } else {
                int input_width  = input->width() + layer->padX() * 2;
                int input_height = input->height() + layer->padY() * 2;
                output_width     = (input_width - kernel_width) / layer->strideX() + 1;
                output_height    = (input_height - kernel_height) / layer->strideY() + 1;
            }
        }

        auto& outputBuffer         = outputs[0]->buffer();
        outputBuffer.dimensions    = input->buffer().dimensions;
        auto format = TensorUtils::getDescribe(input)->dimensionFormat;
        outputBuffer.type = input->getType();
        if (op->main_as_Convolution2D() && op->main_as_Convolution2D()->symmetricQuan() && op->main_as_Convolution2D()->symmetricQuan()->outputDataType() != DataType_DT_INT8) {
            auto type = op->main_as_Convolution2D()->symmetricQuan()->outputDataType();
            outputs[0]->setType(type);
        }
        outputBuffer.dim[0].extent = input->buffer().dim[0].extent;
        if (MNN_DATA_FORMAT_NHWC == format) {
            outputBuffer.dim[3].extent = outputCount;
            outputBuffer.dim[1].extent = output_height;
            outputBuffer.dim[2].extent = output_width;
        } else {
            outputBuffer.dim[1].extent = outputCount;
            outputBuffer.dim[2].extent = output_height;
            outputBuffer.dim[3].extent = output_width;
        }
        // MNN_PRINT("outputs: %d, %d, %d, %d\n", outputs[0]->length(0), outputs[0]->length(1), outputs[0]->length(2), outputs[0]->length(3));
        TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
        return true;
    }

1.1.1.1.2 Pipeline::UnitInfo::setUp

在函数 Pipeline::encode 中调用 Pipeline::UnitInfo::setUp 函数的代码如下：

  static_cast<UnitInfo*>(cmd.info.get())->setUp(cmd, index++, info.op, totalIndex++);

其实现代码如下：

// source/core/Pipeline.cpp
void Pipeline::UnitInfo::setUp(const Command& command, int index, const Op* originOp, int totalIndex) {
    if (nullptr != command.op->name()) {
        mContent->name = command.op->name()->str();
    } else {
        if (nullptr != originOp && nullptr != originOp->name()) {
            char buffer[20];
            sprintf(buffer, "%d", index);
            mContent->name = originOp->name()->str() + "_raster_" + buffer;
        } else {
            char buffer[20];
            sprintf(buffer, "_raster_%d", totalIndex);
            mContent->name = buffer;
        }
    }
#ifdef MNN_OP_SEPERATE
    if (command.op->type() == OpType_UnaryOp) {
        mContent->type = EnumNameUnaryOpOperation(command.op->main_as_UnaryOp()->opType());
    } else if (command.op->type() == OpType_BinaryOp) {
        mContent->type = EnumNameBinaryOpOperation((BinaryOpOperation)(command.op->main_as_BinaryOp()->opType()));
    } else if (command.op->type() == OpType_Reduction) {
        mContent->type = EnumNameReductionType(command.op->main_as_ReductionParam()->operation());
    } else {
        mContent->type = EnumNameOpType(command.op->type());
    }
#else
    mContent->type = EnumNameOpType(command.op->type());
#endif
#ifndef MNN_BUILD_MINI
    mContent->flops = SizeComputer::computeFlops(command.op, command.inputs, command.outputs);
#endif
}

1.1.1.1.2.1 SizeComputer::computeFlops

// source/shape/SizeComputer.cpp
float SizeComputer::computeFlops(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                 const std::vector<Tensor*>& outputs) {
    auto computeFactory = SizeComputerSuite::get();
    auto computer       = computeFactory->search(op->type());
    if (nullptr != computer) {
        return computer->onComputeFlops(op, inputs, outputs);
    }
    if (op->type() == OpType_While && op->main_type() == OpParameter_LoopParam) {
        auto sumFlops = 0.0f;
        auto loop = op->main_as_LoopParam();
        if (nullptr != loop->commands()) {
            auto cmdSize = loop->commands()->size();
            for (int i=0; i<cmdSize; ++i) {
                auto cmd = loop->commands()->GetAs<RegionCommand>(i);
                auto size = cmd->size()->data();
                sumFlops += (float)size[0] * (float)size[1] * (float)size[2];
            }
        }
        sumFlops *= (float)loop->loopNumber();
        return sumFlops / 1024.0f / 1024.0f;
    }
    auto sumFlops = 0.0f;
    for (auto output : outputs) {
        sumFlops += (float)output->elementSize() / 1024.0f / 1024.0f;
    }
    return sumFlops;
}

1.1.1.1.2.1.1 SizeComputer::onComputeFlops

在函数 SizeComputer::computeFlops 中调用 SizeComputer::onComputeFlops 函数的代码如下：

    auto computeFactory = SizeComputerSuite::get();
    auto computer       = computeFactory->search(op->type());
    if (nullptr != computer) {
        return computer->onComputeFlops(op, inputs, outputs);
    }

onComputeFlops 函数是个虚函数， computer->onComputeFlops 调用是个多态，其基类为 SizeComputer，我们选择一个实例 ConvolutionSizeComputer 进行分析，其具体实现代码如下：

// source/shape/ShapeConvolution.cpp
class ConvolutionSizeComputer : public SizeComputer {
    virtual float onComputeFlops(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                 const std::vector<Tensor*>& outputs) const override {
        const Convolution2DCommon* layer = loadCommon(op);
        auto kw    = layer->kernelX();
        auto kh    = layer->kernelY();
        auto group = layer->group();
        auto ic    = inputs[0]->channel();
        auto oc    = outputs[0]->channel();
        auto oSize = outputs[0]->width() * outputs[0]->height() * outputs[0]->batch();
        if (op->type() == OpType_QuantizedDepthwiseConv2D) {
            group = ic;
        }
        if (layer->inputCount() != ic && layer->inputCount() > 0) {
            group = ic / layer->inputCount();
        }
        auto flops = (float)oSize * kw * kh * (ic * oc / (group == 0 ? 1 : group)) / FLOPS_M;
        return flops;
    }
}

1.1.1.2 Command 命令

// source/core/Command.hpp
struct Command : public RefCount {
    const Op* op;
    std::vector<Tensor*> workInputs;
    std::vector<Tensor*> workOutputs;
    std::vector<Tensor*> inputs;
    std::vector<Tensor*> outputs;
    std::shared_ptr<BufferStorage> buffer;
    std::shared_ptr<Execution> execution;
    std::shared_ptr<OperatorInfo> info;
    #ifdef MNN_BUILD_CODEGEN
    bool canVectorize = false;
    #endif
};

1.1.1.3 CommandBuffer

// source/core/Command.hpp
struct CommandBuffer {
    std::vector<SharedPtr<Command>> command;
    std::vector<std::shared_ptr<Tensor>> extras;
    bool hasWrap = false;
};

MNN Session 之维度计算（五）