MNN Session: CPU Operators (Part 7)

1. createSession

    Creates a session from a ScheduleConfig and a RuntimeInfo.

// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
    return createMultiPathSession({config}, runtime);
}
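
    For orientation, the typical user-side call sequence that reaches this code looks like the sketch below. The API calls are MNN's public interface, but the model path and surrounding scaffolding are illustrative, not from the MNN source tree:

// Illustrative usage sketch (not MNN source); "model.mnn" is a placeholder path
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    MNN::ScheduleConfig config;
    config.type      = MNN_FORWARD_CPU;  // the CPU backend analyzed in this series
    config.numThread = 4;
    // createSession -> createMultiPathSession -> Session::resize (analyzed below)
    MNN::Session* session = net->createSession(config);
    MNN::Tensor* input = net->getSessionInput(session, nullptr);
    // ... fill input->host<float>() with data ...
    net->runSession(session);
    MNN::Tensor* output = net->getSessionOutput(session, nullptr);
    (void)output;
    return 0;
}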

1.1 createMultiPathSession

Excerpt from the full code of createMultiPathSession:

// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
  // ...
    auto result = newSession.get();
    auto validForResize = info.validForResize;
    if (validForResize && mNet->modes.inputMode == Session_Input_Inside && mNet->modes.resizeMode == Session_Resize_Direct) {
        result->resize();
    }
    // ...
    return result;
}

1.1.1 Session::resize

Excerpt from the full code of Session::resize:

// source/core/Session.cpp
ErrorCode Session::resize() {
  // ...
    if (mNeedResize) {
        bool debug = mCallBackMode == Interpreter::Session_Debug;
        // mPipelines has type std::vector<std::shared_ptr<Pipeline>>
        for (auto& iter : mPipelines) {
            auto error = iter->encode(debug, permitCodegen);
            if (NO_ERROR != error) {
                return error;
            }
        }
        mNeedResize = false;
        mNeedMalloc = true;
        firstMalloc = true;
    }
    // ...
}

1.1.1.1 Pipeline::encode

Excerpt from the full code of Pipeline::encode. The BackendCache and OpCacheInfo types it relies on are reproduced in the comment block at the top:

// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
//   struct BackendCache {
//      Backend::Info info;
//      BackendConfig config;
//      std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
//      bool needComputeShape = true;
//      bool needComputeGeometry = true;
//      bool reportError = true;
//      std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
//  };
//
//    /** pipeline info */
//    struct OpCacheInfo {
//        /** op */
//        const Op* op;
//        /** input tensors */
//        std::vector<Tensor*> inputs;
//        /** output tensors */
//        std::vector<Tensor*> outputs;
//        /** schedule type*/
//        Schedule::Type type = Schedule::Type::SEPARATE;
//
//        /**Command buffer for cache*/
//        CommandBuffer cacheBuffer;
//
//        /**Command buffer for execute*/
//        CommandBuffer executeBuffer;
//        
//        std::map<const Op*, std::shared_ptr<Execution>> executionCache;
//    };
//
ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) {
  // mInfo.first.cache has type std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>>
  // mBackend: the backend created for this pipeline, e.g. VulkanBackend
    auto& mBackend = mInfo.first.cache.first;
    // mBackupBackend: the fallback (default) backend, e.g. CPUBackend
    auto& mBackupBackend = mInfo.first.cache.second;
    // Static Model just copy info to command buffer
    // mInfo.first has type BackendCache
    if (!mInfo.first.needComputeGeometry) {
        // ...
    } else {
#ifndef MNN_BUILD_MINI
    // mContext has type GeometryComputer::Context
        mContext.clear();
        /** Size Compute and compute Const Begin */
        auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen);
        if (res != NO_ERROR) {
            return res;
        }
#endif
    }
  // ...
    return NO_ERROR;
}

1.1.1.1.1 GeometryComputerUtils::shapeComputeAndGeometryTransform

Excerpt from the full code of GeometryComputerUtils::shapeComputeAndGeometryTransform; the OpCacheInfo type is repeated in the comments for reference:

// source/geometry/GeometryComputerUtils.cpp
//    /** pipeline info */
//    struct OpCacheInfo {
//        /** op */
//        const Op* op;
//        /** input tensors */
//        std::vector<Tensor*> inputs;
//        /** output tensors */
//        std::vector<Tensor*> outputs;
//        /** schedule type*/
//        Schedule::Type type = Schedule::Type::SEPARATE;
//
//        /**Command buffer for cache*/
//        CommandBuffer cacheBuffer;
//
//        /**Command buffer for execute*/
//        CommandBuffer executeBuffer;
//        
//        std::map<const Op*, std::shared_ptr<Execution>> executionCache;
//    };
//
ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
    std::vector<Schedule::OpCacheInfo>& infos,
    GeometryComputer::Context& geoContext,
    std::shared_ptr<Backend> backupBackend,
    Runtime::CompilerType compileType, 
    bool skipShapeCompute,
    bool permitCodegen) {
    /** Size Compute and compute Const Begin */
    GeometryComputer::Context ctx(backupBackend);
    // Size Compute and compute Const
    // infos holds the per-op cache entries (size 171 for the model traced in this series)
    for (int i=0; i<infos.size(); ++i) {
      // info has type OpCacheInfo
        auto& info = infos[i];
        auto& cmdBufferVir = info.executeBuffer;
        auto& tempBuffer = info.cacheBuffer;
        // ...
        if (info.type == Schedule::CONSTANT) {
            // ...
            for (auto& cp : cmdBufferVir.command) {
                auto& c = *cp;
                if (nullptr == c.execution) {
                    c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
                }
                auto exe = c.execution;
                if (nullptr == exe.get()) {
                    MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str());
                    return NO_EXECUTION;
                }
                for (auto t : c.outputs) {
                    auto des = TensorUtils::getDescribe(t);
                    TensorUtils::setLinearLayout(t);
                    auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
                    if (!res) {
                        return OUT_OF_MEMORY;
                    }
                    des->setBackend(backupBackend.get());
                }
                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = backupBackend->onResizeEnd();
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }
                code = exe->onExecute(c.inputs, c.outputs);
                if (NO_ERROR != code) {
                    return NOT_SUPPORT;
                }

            }
            // Clear const command
            ctx.pushCache(cmdBufferVir);
            cmdBufferVir.command.clear();
            cmdBufferVir.extras.clear();
        }
    }
    /** Size Compute and compute Const End */
  // ...
    return NO_ERROR;
}
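
    Distilling the loop above, each constant command cp (with c = *cp) goes through a four-step lifecycle; the subsections that follow trace these calls one at a time:

// 1. create:   c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
// 2. allocate: backupBackend->onAcquireBuffer(t, Backend::STATIC) for every output t
// 3. resize:   backupBackend->onResizeBegin(); exe->onResize(...); backupBackend->onResizeEnd();
// 4. execute:  exe->onExecute(c.inputs, c.outputs);  // constants are folded at resize time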

1.1.1.1.1.1 CPUBackend::onCreate

    The call to CPUBackend::onCreate in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

    for (auto& cp : cmdBufferVir.command) {
        auto& c = *cp;
        if (nullptr == c.execution) {
            c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
        }
        // ...
    }

    Since the backupBackend passed in is a CPUBackend (which inherits from Backend), the call dispatches to CPUBackend::onCreate, implemented as follows:

// source/backend/cpu/CPUBackend.cpp
/// get execution
Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op) {
    /**
     BatchNorm it will be converted to scale
     for model convert, don't print error log
     */
    if (op->type() == OpType_BatchNorm) {
        return nullptr;
    }
    auto opType = op->type();
    if (outputs.size() > 0) {
        if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) {
            opType = _getRealOpType(opType);
        }
    }

    // TODO: rm this convert when merge diff datatype of op
    auto map  = gCreator;
    auto iter = map->find(opType);
    if (iter == map->end()) {
        MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str());
        return nullptr;
    }
    Execution* exe = nullptr;
    bool needCast = false;
    if (exe == nullptr) {
        exe = iter->second->onCreate(inputs, outputs, op, this);
    }
    return exe;
}

1.1.1.1.1.1.1 CPUBackend::Creator::onCreate

    CPUBackend::Creator::onCreate is invoked from CPUBackend::onCreate at the call site below:

    auto map  = gCreator;
    auto iter = map->find(opType);
    // ...
    Execution* exe = nullptr;
    bool needCast = false;
    if (exe == nullptr) {
      // create the operator's Execution according to opType
        exe = iter->second->onCreate(inputs, outputs, op, this);
    }

    Note: the iter->second->onCreate call is polymorphic; which concrete subclass runs depends on the operator type opType. The common base class is CPUBackend::Creator.

    One such implementation is CPURasterFactory:

// source/backend/cpu/CPURaster.cpp
class CPURasterFactory : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const {
        if (op->type() == OpType_While) {
            if (op->main_type() != OpParameter_LoopParam) {
                return nullptr;
            }
            return new CPULoop(backend, op->main_as_LoopParam());
        }
        return new CPURaster(backend);
    }
};

REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_Raster);
REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_While);

1.1.1.1.1.1.2 Backend

    Backend is the abstract base class for all backends; its definition is as follows:

// source/core/Backend.hpp
/** abstract backend */
class Backend : public NonCopyable {

public:
    /** info used to create backend */
    struct Info {
        /** forward type. */
        MNNForwardType type = MNN_FORWARD_CPU;
        /** numThread for CPU . number of threads.  gpuMode for GPU only. tuning/memory Mode setting. */
        union {
            int numThread = 4;
            int gpuMode;
        };
        /** user data. */
        BackendConfig* user = NULL;
        enum Mode {
            // The Op will be run in execution->onExecute
            DIRECT = 0,

            // The Op will be recorded. Run in onExecuteBegin and Wait in onExecuteEnd
            INDIRECT = 1
        };
        Mode mode = DIRECT;
        enum Allocator {
            DEFER = 0,
            EAGER = 1
        };
        Allocator allocator = DEFER;
    };

    /** backend buffer storage type */
    enum StorageType {
        /**
         use NOT reusable memory.
         - allocates memory when `onAcquireBuffer` is called.
         - releases memory when `onReleaseBuffer` is called or when the backend is deleted.
         - do NOTHING when `onClearBuffer` is called.
         */
        STATIC,
        /**
         use reusable memory.
         - allocates or reuses memory when `onAcquireBuffer` is called. prefers reusing.
         - collects memory for reuse when `onReleaseBuffer` is called.
         - releases memory when `onClearBuffer` is called or when the backend is deleted.
         */
        DYNAMIC,
        /**
         use NOT reusable memory.
         - allocates memory when `onAcquireBuffer` is called.
         - do NOTHING when `onReleaseBuffer` is called.
         - releases memory when `onClearBuffer` is called or when the backend is deleted.
         */
        DYNAMIC_SEPERATE
    };

public:
    /**
     * @brief initializer.
     * @param type  forward type.
     */
    Backend(MNNForwardType type) : mType(type) {
        // nothing to do
    }

    /**
     * @brief deinitializer.
     */
    virtual ~Backend() = default;

public:

    /**
     * @brief create execution for op with input and output tensors.
     * @param inputs    input tensors.
     * @param outputs   output tensors.
     * @param op        given op.
     * @return created execution if op is supported, nullptr otherwise.
     */
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op) = 0;

    /**
     * @brief callback before resize ops.
     */
    virtual void onResizeBegin() {
        // nothing to do
    }
    /**
     * @brief callback after resize ops.
     */
    virtual ErrorCode onResizeEnd() = 0;

    /**
     * @brief callback before executing ops.
     */
    virtual void onExecuteBegin() const = 0;
    /**
     * @brief callback after executing ops.
     */
    virtual void onExecuteEnd() const = 0;

    virtual const Runtime* getRuntime() {
        return nullptr;
    }
    const std::string externalFile();
public:
    /**
     * @brief allocate buffer of tensor for given storage type.
     * @param tensor        buffer provider.
     * @param storageType   buffer storage type.
     * @return success or not.
     */
    MNN_PUBLIC bool onAcquireBuffer(const Tensor* tensor, StorageType storageType);

    /**
     * @brief release buffer of tensor for given storage type.
     * @param tensor        buffer provider.
     * @param storageType   buffer storage type.
     * @return success or not.
     */
    MNN_PUBLIC bool onReleaseBuffer(const Tensor* tensor, StorageType storageType);

    class MemObj {
    public:
        MemObj() {}
        virtual ~ MemObj() {}
        virtual MemChunk chunk() { return MemChunk(); }
    };
    /**
     * @brief allocate buffer of tensor for given storage type.
     * @param tensor        buffer provider.
     * @param storageType   buffer storage type.
     * @return MemObj for release, if failed, return nullptr.
     */
    virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) = 0;
    
    /**
     * @brief get buffer from tensor directly
     * @param tensor        buffer provider.
     * @return support or not
     */
    virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
        return false;
    }

    /**
     * @brief clear all dynamic buffers.
     * @return success or not.
     */
    virtual bool onClearBuffer() = 0;

    /**
     * @brief copy buffer from tensor to tensor.
     * @param srcTensor source buffer provider.
     * @param dstTensor dest buffer provider.
     */
    virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const = 0;

public:
    /**
     * @brief get forward type.
     * @return forward type.
     */
    inline MNNForwardType type() const {
        return mType;
    }

public:
    /**
     * @brief get Gpu Tensor map host ptr/ unmap
     */
    virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) {
        return nullptr;
    }

    virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) {
        return false;
    }

    virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
        return 0;
    }

private:
    const MNNForwardType mType;
};
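
    To make this contract concrete, here is a minimal sketch of a hypothetical backend that satisfies the pure-virtual interface with plain heap allocations. ToyBackend and ToyMemObj are invented names; a real backend such as CPUBackend routes allocation through buffer allocators and also wires the allocation into tensor->buffer().host:

// Hypothetical minimal Backend subclass (illustrative; not in the MNN tree).
// Assumes MNN's internal core/Backend.hpp is available.
#include <cstdlib>
#include <cstring>
#include <vector>

using namespace MNN;

class ToyBackend : public Backend {
public:
    ToyBackend() : Backend(MNN_FORWARD_CPU) {}

    // The MemObj owns the allocation; Backend::onAcquireBuffer stores it in the
    // tensor's describe, so the memory lives exactly as long as the tensor needs it.
    class ToyMemObj : public MemObj {
    public:
        explicit ToyMemObj(size_t size) : mPtr(::malloc(size)) {}
        virtual ~ToyMemObj() { ::free(mPtr); }
        void* ptr() const { return mPtr; }
    private:
        void* mPtr;
    };

    virtual Execution* onCreate(const std::vector<Tensor*>& inputs,
                                const std::vector<Tensor*>& outputs,
                                const MNN::Op* op) override {
        return nullptr; // a real backend consults a creator registry here (see below)
    }
    virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override {
        // A real backend treats STATIC / DYNAMIC / DYNAMIC_SEPERATE differently;
        // this sketch always hands out fresh, non-reusable memory.
        return new ToyMemObj(tensor->size());
    }
    virtual bool onClearBuffer() override { return true; }
    virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override {
        ::memcpy(dstTensor->host<void>(), srcTensor->host<void>(), srcTensor->size());
    }
    virtual ErrorCode onResizeEnd() override { return NO_ERROR; }
    virtual void onExecuteBegin() const override {}
    virtual void onExecuteEnd() const override {}
};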

1.1.1.1.1.1.3 Execution

    Execution is the base class for concrete operator implementations; its definition is as follows:

// source/core/Execution.hpp
/** abstract execution */
class Execution : public NonCopyable {
public:
    /**
     * @brief initializer.
     * @param backend   backend that exection will running on.
     */
    Execution() = delete;
    Execution(Backend *backend) : mBackEnd(backend) {
        // nothing to do
    }
    /**
     * @brief deinitializer.
     */
    virtual ~Execution() = default;

    /**
     * @brief response shape change of input or output tensors.
     * @param inputs    input tensors
     * @param outputs   output tensors
     * @return resize result
     */
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
        return NO_ERROR;
    }

    /**
     * @brief perform execution.
     * @param inputs    input tensors
     * @param outputs   output tensors
     * @return execution result
     */
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;

    /**
     * @brief clone execution, new execution will share weight from this execution
     * @param bn   the cloned' execution's backend
     * @param dst if dst = nullptr, just return whether execution can clone, otherwise clone the execution into dst
     * @return execution result
     */
    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) {
        return false;
    }
public:
    /**
     * @brief designed for plugin system. not ready yet.
     */
    class Creator : public NonCopyable {
    public:
        /**
         * @brief deinitializer.
         */
        virtual ~Creator() = default;
        /**
         * @brief create execution for given op on given backend.
         * @param backend   given backend.
         * @param op        given op.
         * @return execution.
         */
        virtual Execution *onCreate(Backend *backend, const Op *op) const = 0;
    };

    // Search for extra creator, if not found, return nullptr
    MNN_PUBLIC static const Creator *searchExtraCreator(const std::string &key, MNNForwardType type);

    /**
     * @brief register creator for given key and backend type.
     * @param creator registering creator.
     * @param key given key.
     * @param type given backend type.
     * @return false if registered creator for same key and type exists, true otherwise.
     */
    MNN_PUBLIC static bool insertExtraCreator(std::shared_ptr<Creator> creator, const std::string &key,
                                              MNNForwardType type);

    /**
     * @brief unregister creator for given key and backend type.
     * @param key given key.
     * @param type given backend type.
     * @return true if registered creator for given key and type exists, false otherwise.
     */
    MNN_PUBLIC static bool removeExtraCreator(const std::string &key, MNNForwardType type);

public:
    /**
     * @brief check if execution is valid.
     * @return valid or not.
     */
    inline bool valid() const {
        return mValid;
    }
    /**
     * @brief get backend.
     * @return backend.
     */
    Backend *backend() const {
        return mBackEnd;
    }

protected:
    bool mValid = true;

private:
    Backend *mBackEnd;
};

1.1.1.1.1.1.4 Registering CPU Operator Executions

    The gCreator map consulted in CPUBackend::onCreate caches a creator (CPUBackend::Creator) for every CPU operator execution. It is initialized and populated when registerBackend calls registerCPURuntimeCreator.

// source/core/BackendRegister.cpp
static std::once_flag s_flag;
void registerBackend() {
    std::call_once(s_flag, [&]() {
    // ...
        registerCPURuntimeCreator();
    // ...        
    });
}

    registerCPURuntimeCreator() is implemented as follows:

// source/backend/cpu/CPUBackend.cpp
void registerCPURuntimeCreator() {
    CPUBackend::initCreatorMap();
    registerCPUOps();
#ifdef MNN_SUPPORT_BF16
    registerBF16Backend();
#endif
#ifdef MNN_USE_ARMV82
    registerArm82RuntimeCreator();
#endif
    // TODO: Merge _initCoreFunction MNNFunctionInit and cpuinfo_arm_init
    MNNCoreFunctionInit();
    MNNInsertExtraRuntimeCreator(MNN_FORWARD_CPU, new CPURuntimeCreator);
};

    registerCPUOps registers the creators for every CPU operator execution (Execution):

// source/backend/cpu/CPUOPRegister.cpp
void registerCPUOps() {
___CPUCropAndResizeCreator__OpType_CropAndResize__();
___CPUArgMaxCreator__OpType_ArgMax__();
___CPUArgMaxCreator__OpType_ArgMin__();
  // ...
}

    Functions such as ___CPUArgMaxCreator__OpType_ArgMax__ are generated by the REGISTER_CPU_OP_CREATOR macro:

// source/backend/cpu/CPUBackend.hpp
#define REGISTER_CPU_OP_CREATOR(name, opType)     \
    void ___##name##__##opType##__() {            \
        static name _temp;\
        CPUBackend::addCreator(opType, &_temp); \
    }

    The creator itself is implemented as follows:

// source/backend/cpu/CPUArgMax.cpp
class CPUArgMaxCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const {
        auto argMax = op->main_as_ArgMax();
        if (op->type() == OpType_ArgMin) {
            return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMIN,
                    argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
        } else {
            return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMAX,
                    argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
        }
    }
};
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax);
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMin);

    The macro invocation REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax) expands to:

// REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax)
    void ___CPUArgMaxCreator__OpType_ArgMax__() { 
        static CPUArgMaxCreator _temp;
        CPUBackend::addCreator(OpType_ArgMax, &_temp);
    }

    Registration goes through CPUBackend::addCreator, implemented as follows:

// source/backend/cpu/CPUBackend.cpp
bool CPUBackend::addCreator(OpType t, Creator* c) {
    auto map = gCreator;
    if (map->find(t) != map->end()) {
        MNN_PRINT("Error: %d type has be added\n", t);
        return false;
    }
    map->insert(std::make_pair(t, c));
    return true;
}

    As the code shows, creators are ultimately registered into gCreator.

    In summary, the macro expands to exactly the function whose name, ___CPUArgMaxCreator__OpType_ArgMax__, matches the call in registerCPUOps, and gCreator is the same map that CPUBackend::onCreate consults when creating executions.
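
    The whole mechanism is an instance of the classic self-registering factory pattern. Below is a condensed, self-contained sketch of the same idea with simplified names (this is illustrative code, not MNN source):

// Condensed sketch of the creator-registry pattern used above (not MNN code)
#include <cstdio>
#include <map>

enum OpType { OpType_ArgMax, OpType_Raster };
struct Execution { virtual ~Execution() = default; };

struct Creator {
    virtual ~Creator() = default;
    virtual Execution* onCreate() const = 0;
};

// plays the role of gCreator
static std::map<OpType, Creator*>* registry() {
    static std::map<OpType, Creator*> m;
    return &m;
}

// plays the role of CPUBackend::addCreator
static bool addCreator(OpType t, Creator* c) {
    auto map = registry();
    if (map->find(t) != map->end()) return false; // already registered
    map->insert(std::make_pair(t, c));
    return true;
}

// plays the role of REGISTER_CPU_OP_CREATOR
#define REGISTER_OP_CREATOR(name, opType)      \
    void ___##name##__##opType##__() {         \
        static name _temp;                     \
        addCreator(opType, &_temp);            \
    }

struct ArgMaxExecution : Execution {};
struct ArgMaxCreator : Creator {
    Execution* onCreate() const override { return new ArgMaxExecution; }
};
REGISTER_OP_CREATOR(ArgMaxCreator, OpType_ArgMax)

int main() {
    ___ArgMaxCreator__OpType_ArgMax__();            // done by registerCPUOps in MNN
    auto iter = registry()->find(OpType_ArgMax);    // done by CPUBackend::onCreate
    Execution* exe = iter->second->onCreate();
    std::printf("created: %p\n", (void*)exe);
    delete exe;
    return 0;
}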

Adding a CPU Implementation

    Add CPUMyCustomOp.hpp and CPUMyCustomOp.cpp under the source/backend/CPU directory.

  1. Declare the implementation class

class CPUMyCustomOp : public Execution {
public:
    // If onExecute needs scratch memory, acquire it here;
    // omit this override if no cache is needed
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, 
                               const std::vector<Tensor *> &outputs) override;
    // The op's actual execution function
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, 
                                const std::vector<Tensor *> &outputs) override;
};


2. Implement onResize and onExecute

onResize中,调用backend()->onAcquireBuffer(&mCache, Backend::DYNAMIC)进行缓存的申请,调用backend()->onReleaseBuffer(&mCache, Backend::DYNAMIC)回收缓存。释放后的内存可以被复用。

onExecute中,做必要的输入的检查,有利于提前发现问题。若执行完毕正确返回NO_ERROR。


3. Register the implementation class

class CPUMyCustomOpCreator : public CPUBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, 
                                const std::vector<Tensor *> &outputs, 
                                const MNN::Op *op,
                                Backend *backend) const override {
        return new CPUMyCustomOp(backend);
    }
};
REGISTER_CPU_OP_CREATOR(CPUMyCustomOpCreator, OpType_MyCustomOp);

1.1.1.1.1.2 Backend::onAcquireBuffer

    The call to Backend::onAcquireBuffer in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

  auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);

    onAcquireBuffer is defined only on the Backend base class; it allocates memory for a tensor:

bool Backend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) {
    auto mem = this->onAcquire(tensor, storageType);
    if (nullptr == mem) {
        return false;
    }
    if (mem == TensorUtils::getDescribe(tensor)->mem.get()) {
        return true;
    }
    TensorUtils::getDescribe(tensor)->mem.reset(mem);
    return true;
}

    onAcquireBuffer calls the virtual function onAcquire. Since the backupBackend passed in is a CPUBackend (which inherits from Backend), the call dispatches to CPUBackend::onAcquire:

// source/backend/cpu/CPUBackend.cpp
Backend::MemObj* CPUBackend::onAcquire(const MNN::Tensor* nativeTensorConst, StorageType storageType) {
    if (nativeTensorConst == nullptr) {
        return nullptr;
    }
    //FUNC_PRINT_ALL(nativeTensorConst, p);
    auto nativeTensor = (Tensor*)nativeTensorConst;
    auto size = getTensorSize(nativeTensor, true);
    return allocBuffer(size, nativeTensor, storageType);
}

1.1.1.1.1.3 Backend::onResizeBegin

    The call to Backend::onResizeBegin in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                // ...
                code = backupBackend->onResizeEnd();
                // ...
                code = exe->onExecute(c.inputs, c.outputs);
                // ...

    onResizeBegin is a virtual function; since the backupBackend passed in is a CPUBackend (which inherits from Backend), the call dispatches to CPUBackend::onResizeBegin:

// source/backend/cpu/CPUBackend.cpp
void CPUBackend::onResizeBegin() {
    mDynamicAllocator->reset();
}

1.1.1.1.1.4 Execution::onResize

    The call to Execution::onResize in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                // ...
                code = backupBackend->onResizeEnd();
                // ...
                code = exe->onExecute(c.inputs, c.outputs);
                // ...

    onResize is a virtual function; see CPUBackend::onCreate above for how exe is created. The exe->onResize call is polymorphic with base class Execution; we pick one implementation, CPULoop, for analysis:

// source/backend/cpu/CPURaster.cpp
class CPULoop : public Execution {
      virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
        int inputIndexSize = mLoop->inputIndexes()->size();
        MNN_ASSERT(inputIndexSize == inputs.size());
        for (int i=0; i<inputIndexSize; ++i) {
            mStack[mLoop->inputIndexes()->data()[i]] = inputs[i];
        }
        int outputIndexSize = mLoop->outputIndexes()->size();
        MNN_ASSERT(outputIndexSize == outputs.size());
        for (int i=0; i<outputIndexSize; ++i) {
            mStack[mLoop->outputIndexes()->data()[i]] = outputs[i];
        }
        int numberThread = mLoop->parallel() ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
        mMaxCacheSize = 0;
        auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
        mMaxFuseBufferSize = 0;
        for (int i=0; i<mLoop->commands()->size(); ++i) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
            auto op = cmd->op();
            if (cmd->fuse() >= 0) {
                // Make Temp output buffer
                auto size = cmd->size()->data();
                if (cmd->op()->type() == OpType_MatMul) {
                    mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
                } else {
                    mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
                }
            }
            if (OpType_UnaryOp == op->type()) {
                if (nullptr != op->main_as_UnaryOp()) {
                    auto view0 = cmd->view()->GetAs<View>(0);
                    auto view1 = cmd->view()->GetAs<View>(1);
                    MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0);
                    if (view1->stride()->data()[2] != 1) {
                        mMaxCacheSize = std::max(mMaxCacheSize, cmd->size()->data()[2] * bytes);
                    }
                }
                continue;
            }
            if (OpType_BinaryOp == op->type()) {
                auto view0 = cmd->view()->GetAs<View>(0);
                auto view1 = cmd->view()->GetAs<View>(1);
                auto view2 = cmd->view()->GetAs<View>(2);
                MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0);
                if (view1->stride()->data()[2] != 1 || view2->stride()->data()[2] != 1) {
                    mMaxCacheSize = std::max(mMaxCacheSize, 2 * cmd->size()->data()[2] * bytes);
                }
                continue;
            }
            if (OpType_MatMul == op->type()) {
                bool transposeC = true;
                int e = cmd->size()->data()[0];
                int l = cmd->size()->data()[1];
                int h = cmd->size()->data()[2];
                std::shared_ptr<Tensor> A, B, C, Bias;
                C.reset(Tensor::createDevice<float>({e, h}));
                if (op->main_as_MatMul()->transposeA()) {
                    A.reset(Tensor::createDevice<float>({l, e}));
                } else {
                    A.reset(Tensor::createDevice<float>({e, l}));
                }
                if (op->main_as_MatMul()->transposeB()) {
                    B.reset(Tensor::createDevice<float>({h, l}));
                } else {
                    B.reset(Tensor::createDevice<float>({l, h}));
                }
                auto view = cmd->view()->GetAs<View>(0);
                if (view->stride()->data()[0] == 1) {
                    transposeC = false;
                }
                std::vector<Tensor*> inputs, outputs;
                if (cmd->indexes()->size() > 3) {
                    Bias.reset(Tensor::createDevice<float>({h}));
                    inputs = {A.get(), B.get(), Bias.get()};
                } else {
                    inputs = {A.get(), B.get()};
                }
                outputs = {C.get()};
                auto bufferPool = static_cast<CPUBackend*>(backend())->getBufferAllocator();
                auto code = NO_ERROR;
                if (numberThread > 1) {
                    bufferPool->barrierBegin();
                }
                for (int v=0; v<numberThread; ++v) {
                    if (numberThread > 1) {
                        bufferPool->beginGroup();
                    }
                    do {
                        // If not loop parallel, parallel inside
                        bool needParallel = numberThread == 1;
                        mContainer[v].exe[i].reset(new CPUMatMul(backend(), op->main_as_MatMul()->transposeA(),  op->main_as_MatMul()->transposeB(), transposeC, needParallel));
                        if (nullptr == mContainer[v].exe[i]) {
                            code = OUT_OF_MEMORY;
                            break;
                        }
                        code = mContainer[v].exe[i]->onResize(inputs, outputs);
                    } while (false);
                    if (numberThread > 1) {
                        bufferPool->endGroup();
                    }
                    if (NO_ERROR != code) {
                        break;
                    }
                }
                if (numberThread > 1) {
                    bufferPool->barrierEnd();
                }
                if (NO_ERROR != code) {
                    return code;
                }
                continue;
            }
        }
        auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
        if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
            mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
            if (mCacheBuffer.invalid()) {
                return OUT_OF_MEMORY;
            }
            mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
            static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
        }
        return NO_ERROR;
    }
};

1.1.1.1.1.5 Backend::onResizeEnd

    The call to Backend::onResizeEnd in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                // ...
                code = backupBackend->onResizeEnd();
                // ...
                code = exe->onExecute(c.inputs, c.outputs);
                // ...

    onResizeEnd is a virtual function; since the backupBackend passed in is a CPUBackend (which inherits from Backend), the call dispatches to CPUBackend::onResizeEnd, which releases the cache and lets the dynamic allocator compute the final memory plan:

// source/backend/cpu/CPUBackend.cpp
ErrorCode CPUBackend::onResizeEnd() {
    getCache()->release();
    return mDynamicAllocator->compute();
}

1.1.1.1.1.6 Execution::onExecute

    The call to Execution::onExecute in GeometryComputerUtils::shapeComputeAndGeometryTransform looks like this:

                backupBackend->onResizeBegin();
                auto code = exe->onResize(c.inputs, c.outputs);
                // ...
                code = backupBackend->onResizeEnd();
                // ...
                code = exe->onExecute(c.inputs, c.outputs);
                // ...

    onExecute is a virtual function; see CPUBackend::onCreate above for how exe is created. The exe->onExecute call is polymorphic with base class Execution; again we analyze the CPULoop implementation:

// source/backend/cpu/CPURaster.cpp
class CPULoop : public Execution {
    virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
        auto cpubackend = static_cast<CPUBackend*>(backend());
        auto precision = cpubackend->precisionMode();
        auto threadNumber = cpubackend->threadNumber();
        if (mLoop->initCommand() != nullptr) {
            for (int i=0; i<mLoop->initCommand()->size(); ++i) {
                auto cmd = mLoop->initCommand()->GetAs<RegionCommand>(i);
                if (cmd->op() == nullptr) {
                    auto output = mStack[cmd->indexes()->data()[0]];
                    ::memset(output->host<void>(), 0, cpubackend->getTensorSize(output) * cpubackend->functions()->bytes);
                } else {
                    Tensor::InsideDescribe::Region reg;
                    auto srcView = cmd->view()->GetAs<View>(1);
                    auto dstView = cmd->view()->GetAs<View>(0);
                    ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
                    ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
                    ::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t));
                    auto input = mStack[cmd->indexes()->data()[1]];
                    auto inputSize = input->elementSize();
                    auto output = mStack[cmd->indexes()->data()[0]];
                    auto bytes = input->getType().bytes();
                    if (halide_type_float == input->getType().code) {
                        bytes = cpubackend->functions()->bytes;
                    }
                    _blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
                }

            }
        }
        if (1 == mLoop->commands()->size()) {
            auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
            auto op = cmd->op();
            if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
                // For Gather / Single Unary
                auto index0 = cmd->iterIndexes()->data()[0];
                auto index1 = cmd->iterIndexes()->data()[1];
                int32_t iter = 0;
                int32_t* iter0 = &iter;
                int32_t* iter1 = &iter;
                int32_t iter0Stride = 0;
                int32_t iter1Stride = 0;
                if (index0 >= 0) {
                    iter0 = originInputs[index0]->host<int32_t>();
                    iter0Stride = 1;
                }
                if (index1 >= 0) {
                    iter1 = originInputs[index1]->host<int32_t>();
                    iter1Stride = 1;
                }
                Tensor::InsideDescribe::Region reg;
                auto srcView = cmd->view()->GetAs<View>(1);
                auto dstView = cmd->view()->GetAs<View>(0);
                ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
                ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
                ::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t));
                auto input = mStack[cmd->indexes()->data()[1]];
                auto inputSize = input->elementSize();
                auto output = mStack[cmd->indexes()->data()[0]];
                auto bytes = input->getType().bytes();
                if (halide_type_float == input->getType().code) {
                    bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
                }
                auto step0 = cmd->steps()->data()[0];
                auto step1 = cmd->steps()->data()[1];
                auto loopNumber = mLoop->loopNumber();
                for (; iter<loopNumber; ++iter) {
                    auto srcIter = *(iter1 + iter1Stride * iter);
                    auto dstIter = *(iter0 + iter0Stride * iter);
                    auto srcOffset = srcIter * step1 + srcView->offset();
                    auto dstOffset = dstIter * step0 + dstView->offset();
                    if (dstOffset >= 0) {
                        if (srcOffset >= 0 && srcOffset < inputSize) {
                            _blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
                        } else {
                            _zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
                        }
                    }
                }
                return NO_ERROR;
            }
        }
        auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
        auto func = [&](int iter, int tId) {
            int fuseOutputStride[3];
            const int32_t* outputStride = nullptr;
            auto fuseBuffer = mFuseBuffer + mMaxFuseBufferSize * tId;
            for (int index=0; index<mLoop->commands()->size(); ++index) {
                auto cmd = mLoop->commands()->GetAs<RegionCommand>(index);
                auto blit = _selectUnitProc(bytes, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1);
                auto op = cmd->op();
                int iterIndexsize = cmd->iterIndexes()->size();
                
                if (cmd->fuse() >= 0) {
                    outputStride = fuseOutputStride;
                    auto cmdSize = cmd->size()->data();
                    fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
                    fuseOutputStride[1] = cmdSize[2];
                    fuseOutputStride[2] = 1;
                } else {
                    // Loop Op's command's first index must be output
                    outputStride = cmd->view()->GetAs<View>(0)->stride()->data();
                }
                halide_type_t inputType;
                for (int v=0; v<iterIndexsize; ++v) {
                    auto tensorIndex = cmd->indexes()->data()[v];
                    auto tensor = mStack[tensorIndex];
                    auto iterIndex = cmd->iterIndexes()->data()[v];
                    auto offset = iter;
                    if (1 == v) {
                        inputType = tensor->getType();
                    }
                    if (iterIndex >= 0) {
                        offset = mStack[iterIndex]->host<int32_t>()[iter];
                    }
                    auto view = cmd->view()->GetAs<View>(v);
                    offset = offset * cmd->steps()->data()[v] + view->offset();
                    mContainer[tId].stackPtr[tensorIndex] = tensor->host<uint8_t>() + offset * bytes;
                    MNN_ASSERT(nullptr != tensor->host<uint8_t>());
                }
                auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
                auto dst = dstOrigin;
                if (cmd->fuse() >= 0) {
                    dst = fuseBuffer.ptr();
                }
                do {
                    if (OpType_UnaryOp == op->type()) {
                        auto src = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
                        if (nullptr == op->main()) {
                            // Copy
                            Tensor::InsideDescribe::Region reg;
                            auto srcView = cmd->view()->GetAs<View>(1);
                            auto dstView = cmd->view()->GetAs<View>(0);
                            ::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
                            ::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
                            ::memcpy(reg.dst.stride, outputStride, 3 * sizeof(int32_t));
                            auto step0 = cmd->steps()->data()[0];
                            auto step1 = cmd->steps()->data()[1];
                            auto loopNumber = mLoop->loopNumber();
                            _blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
                            break;
                        }
                        auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());
                        auto lastS = cmd->size()->data()[2];
                        if (lastS == 1 || cmd->view()->GetAs<View>(1)->stride()->data()[2] == 1) {
                            for (int z=0; z<cmd->size()->data()[0]; ++z) {
                                auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
                                auto dstZ = dst + z * outputStride[0] * bytes;
                                for (int y=0; y<cmd->size()->data()[1]; ++y) {
                                    auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes;
                                    auto dstY = dstZ + y * outputStride[1] * bytes;
                                    proc(dstY, srcY, lastS);
                                }
                            }
                        } else {
                            // Blit to cache
                            auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
                            for (int z=0; z<cmd->size()->data()[0]; ++z) {
                                auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
                                auto dstZ = dst + z * outputStride[0] * bytes;
                                for (int y=0; y<cmd->size()->data()[1]; ++y) {
                                    auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes;
                                    auto dstY = dstZ + y * outputStride[1] * bytes;
                                    blit(srcCache, srcY, lastS, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1);
                                    proc(dstY, srcCache, lastS);
                                }
                            }
                        }
                        continue;
                    }
                    if (OpType_MatMul == op->type()) {
                        // TODO: Don't support fuse for matmul currently
                        const float* APtr = nullptr;
                        const float* BPtr = nullptr;
                        const float* BiasPtr = nullptr;
                        float* CPtr = (float*)dst;
                        auto exe = static_cast<CPUMatMul*>(mContainer[tId].exe[index].get());
                        APtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
                        BPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[2]];
                        if (iterIndexsize > 3) {
                            BiasPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[3]];
                        }
                        exe->execute(APtr, BPtr, CPtr, BiasPtr);
                        break;
                    }
                    if (OpType_BinaryOp == op->type()) {
                        auto src0 = mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
                        MNNBinaryExecute proc;
                        if (inputType.code == halide_type_float) {
                            proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(op->main_as_BinaryOp()->opType());
                        } else {
                            MNN_ASSERT(inputType.code == halide_type_int);
                            proc = CPUBinary::selectForInt(op->main_as_BinaryOp()->opType());
                        }
                        auto lastS = cmd->size()->data()[2];
                        auto stride0 = outputStride;
                        auto stride1 = cmd->view()->GetAs<View>(1)->stride()->data();
                        MNN_ASSERT(stride0[2] == 1);
                        auto src1 = mContainer[tId].stackPtr[cmd->indexes()->data()[2]];
                        auto stride2 = cmd->view()->GetAs<View>(2)->stride()->data();
                        auto blit1   = _selectUnitProc(bytes, stride1[2], 1);
                        auto blit2   = _selectUnitProc(bytes, stride2[2], 1);
                        if (cmd->size()->data()[2] == 1 || (stride1[2] == 1 && stride2[2] == 1)) {
                            for (int z=0; z<cmd->size()->data()[0]; ++z) {
                                auto src0Z = src0 + z * stride1[0] * bytes;
                                auto src1Z = src1 + z * stride2[0] * bytes;
                                auto dstZ = dst + z * stride0[0] * bytes;
                                for (int y=0; y<cmd->size()->data()[1]; ++y) {
                                    auto src0Y = src0Z + y * stride1[1] * bytes;
                                    auto src1Y = src1Z + y * stride2[1] * bytes;
                                    auto dstY = dstZ + y * stride0[1] * bytes;
                                    proc(dstY, src0Y, src1Y, cmd->size()->data()[2], -1);
                                }
                            }
                        } else {
                            auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
                            auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
                            for (int z=0; z<cmd->size()->data()[0]; ++z) {
                                auto src0Z = src0 + z * stride1[0] * bytes;
                                auto src1Z = src1 + z * stride2[0] * bytes;
                                auto dstZ = dst + z * stride0[0] * bytes;
                                for (int y=0; y<cmd->size()->data()[1]; ++y) {
                                    auto src0Y = src0Z + y * stride1[1] * bytes;
                                    auto src1Y = src1Z + y * stride2[1] * bytes;
                                    auto dstY = dstZ + y * stride0[1] * bytes;
                                    blit1(cache0, src0Y, cmd->size()->data()[2], stride1[2], 1);
                                    blit2(cache1, src1Y, cmd->size()->data()[2], stride2[2], 1);
                                    proc(dstY, cache0, cache1, cmd->size()->data()[2], -1);
                                }
                            }
                        }
                        break;
                    }
                } while(false);
                if (dst != dstOrigin) {
                    MNN_ASSERT(bytes == 4);
                    // Currently only support add and float32
                    auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
                    auto srcF = (const float*)dst;
                    auto dstF = (float*)dstOrigin;
                    int sizeZ = cmd->size()->data()[0];
                    int sizeY = cmd->size()->data()[1];
                    int sizeX = cmd->size()->data()[2];
                    if (cmd->op()->type() == OpType_MatMul) {
                        auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(cmd->fuse());
                        proc(dstF, dstF, srcF, sizeZ * sizeX, -1);
                        continue;
                    }
                    switch (cmd->fuse()) {
                        case BinaryOpOperation_ADD:
                            for (int z=0; z<sizeZ; ++z) {
                                auto srcZ = srcF + z * outputStride[0];
                                auto dstZ = dstF + z * dstStride[0];
                                for (int y=0; y<sizeY; ++y) {
                                    auto srcY = srcZ + y * outputStride[1];
                                    auto dstY = dstZ + y * dstStride[1];
                                    for (int x=0; x<sizeX; ++x) {
                                        auto dstOffset = x * dstStride[2];
                                        dstY[dstOffset] = dstY[dstOffset] + srcY[x];
                                    }
                                }
                            }
                            break;
                        case BinaryOpOperation_MUL:
                            for (int z=0; z<sizeZ; ++z) {
                                auto srcZ = srcF + z * dstStride[0];
                                auto dstZ = dstF + z * outputStride[0];
                                for (int y=0; y<sizeY; ++y) {
                                    auto srcY = srcZ + z * dstStride[1];
                                    auto dstY = dstZ + z * outputStride[1];
                                    for (int x=0; x<sizeX; ++x) {
                                        auto dstOffset = x * dstStride[2];
                                        dstY[dstOffset] = dstY[dstOffset] * srcY[x];
                                    }
                                }
                            }
                            break;
                        case BinaryOpOperation_SUB:
                            for (int z=0; z<sizeZ; ++z) {
                                auto srcZ = srcF + z * dstStride[0];
                                auto dstZ = dstF + z * outputStride[0];
                                for (int y=0; y<sizeY; ++y) {
                                    auto srcY = srcZ + z * dstStride[1];
                                    auto dstY = dstZ + z * outputStride[1];
                                    for (int x=0; x<sizeX; ++x) {
                                        auto dstOffset = x * dstStride[2];
                                        auto D = dstY[dstOffset];
                                        auto S = srcY[x];
                                        dstY[dstOffset] = D - S;
                                    }
                                }
                            }
                            break;
                        default:
                            break;
                    }
                }
            }
        };
        if (mLoop->parallel()) {
            MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
                for (int iter=tId; iter < mLoop->loopNumber(); iter+=threadNumber) {
                    func(iter, tId);
                }
            }
            MNN_CONCURRENCY_END();
        } else {
            for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
                func(iter, 0);
            }
        }
        return NO_ERROR;
    }
};

