CUDA学习(二十三)

简介:

Direct3D互操作性:
Direct3D 9Ex,Direct3D 10和Direct3D 11支持Direct3D互操作性。
CUDA上下文只能与满足以下条件的Direct3D设备互操作:必须使用设置为D3DDEVTYPE_HAL的DeviceType和具有D3DCREATE_HARDWARE_VERTEXPROCESSING标志的BehaviorFlags创建Direct3D 9Ex设备; 必须使用设置为D3D_DRIVER_TYPE_HARDWARE的DriverType创建Direct3D 10和Direct3D 11设备。
可以映射到CUDA地址空间的Direct3D资源是Direct3D缓冲区,纹理和曲面。 这些资源是使用cudaGraphicsD3D9RegisterResource(),cudaGraphicsD3D10RegisterResource()和cudaGraphicsD3D11RegisterResource()注册的。
以下代码示例使用内核动态修改存储在顶点缓冲区对象中的顶点的2D宽度x高度网格:
Direct3D 9 版本:

IDirect3D9* D3D;
IDirect3DDevice9* device;
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
IDirect3DVertexBuffer9* positionsVB;
struct cudaGraphicsResource* positionsVB_CUDA;
int main()
{
    int dev;
    // Initialize Direct3D
    D3D = Direct3DCreate9Ex(D3D_SDK_VERSION);
    // Get a CUDA-enabled adapter
    unsigned int adapter = 0;
    for (; adapter < g_pD3D->GetAdapterCount(); adapter++) {
        D3DADAPTER_IDENTIFIER9 adapterId;
        g_pD3D->GetAdapterIdentifier(adapter, 0, &adapterId);
        if (cudaD3D9GetDevice(&dev, adapterId.DeviceName)
            == cudaSuccess)
            break;
    }
    // Create device
    ...
        D3D->CreateDeviceEx(adapter, D3DDEVTYPE_HAL, hWnd,
            D3DCREATE_HARDWARE_VERTEXPROCESSING,
            &params, NULL, &device);
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    device->CreateVertexBuffer(size, 0, D3DFVF_CUSTOMVERTEX,
        D3DPOOL_DEFAULT, &positionsVB, 0);
    cudaGraphicsD3D9RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
            Render();
        ...
    }
    ...
}
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA));
        // Execute kernel
        dim3 dimBlock(16, 16, 1);
        dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
        createVertices << <dimGrid, dimBlock >> >(positions, time,
            width, height);
        // Unmap vertex buffer
        cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
        // Draw and present
        ...
}
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Calculate uv coordinates
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

Direct3D 10 版本:

ID3D10Device* device;
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
ID3D10Buffer* positionsVB;
struct cudaGraphicsResource* positionsVB_CUDA;
int main()
{
    int dev;
    // Get a CUDA-enabled adapter
    IDXGIFactory* factory;
    CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&factory);
    IDXGIAdapter* adapter = 0;
    for (unsigned int i = 0; !adapter; ++i) {
        if (FAILED(factory->EnumAdapters(i, &adapter))
            break;
        if (cudaD3D10GetDevice(&dev, adapter) == cudaSuccess)
            break;
        adapter->Release();
    }
    factory->Release();
    // Create swap chain and device
    ...
        D3D10CreateDeviceAndSwapChain(adapter,
            D3D10_DRIVER_TYPE_HARDWARE, 0,
            D3D10_CREATE_DEVICE_DEBUG,
            D3D10_SDK_VERSION,
            &swapChainDesc, &swapChain,
            &device);
    adapter->Release();
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    D3D10_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D10_USAGE_DEFAULT;
    bufferDesc.ByteWidth = size;
    bufferDesc.BindFlags = D3D10_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = 0;
    device->CreateBuffer(&bufferDesc, 0, &positionsVB);
    cudaGraphicsD3D10RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
            Render();
        ...
    }
    ...
}
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA));
        // Execute kernel
        dim3 dimBlock(16, 16, 1);
        dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
        createVertices << <dimGrid, dimBlock >> >(positions, time,
            width, height);
        // Unmap vertex buffer
        cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
        // Draw and present
        ...
}
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Calculate uv coordinates
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

Direct3D 11版本

ID3D11Device* device;
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
ID3D11Buffer* positionsVB;
struct cudaGraphicsResource* positionsVB_CUDA;
int main()
{
    int dev;
    // Get a CUDA-enabled adapter
    IDXGIFactory* factory;
    CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&factory);
    IDXGIAdapter* adapter = 0;
    for (unsigned int i = 0; !adapter; ++i) {
        if (FAILED(factory->EnumAdapters(i, &adapter))
            break;
        if (cudaD3D11GetDevice(&dev, adapter) == cudaSuccess)
            break;
        adapter->Release();
    }
    factory->Release();
    // Create swap chain and device
    ...
        sFnPtr_D3D11CreateDeviceAndSwapChain(adapter,
            D3D11_DRIVER_TYPE_HARDWARE,
            0,
            D3D11_CREATE_DEVICE_DEBUG,
            featureLevels, 3,
            D3D11_SDK_VERSION,
            &swapChainDesc, &swapChain,
            &device,
            &featureLevel,
            &deviceContext);
    adapter->Release();
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = size;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = 0;
    device->CreateBuffer(&bufferDesc, 0, &positionsVB);
    cudaGraphicsD3D11RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
            Render();
        ...
    }
    ...
}
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA));
        // Execute kernel
        dim3 dimBlock(16, 16, 1);
        dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
        createVertices << <dimGrid, dimBlock >> >(positions, time,
            width, height);
        // Unmap vertex buffer
        cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
        // Draw and present
        ...
}
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Calculate uv coordinates
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

SLI互操作性;
在具有多个GPU的系统中,所有支持CUDA的GPU都可以通过CUDA驱动程序和运行时作为单独的设备访问。 当系统处于SLI模式时,有以下特殊的考虑:
首先,在一个GPU上的一个CUDA设备中的分配会消耗作为Direct3D或OpenGL设备的SLI配置的一部分的其他GPU上的存储器。 因此,分配可能会比预期的更早失败;
其次,应用程序应该为SLI配置中的每个GPU创建多个CUDA上下文。 虽然这不是一个严格的要求,但它避免了设备之间不必要的数据传输。 应用程序可以使用Direct3D的cudaD3D [GetDevices()]和OpenGL的cudaGLGetDevices()函数来标识正在当前执行渲染的设备的CUDA设备句柄 和下一帧。 给定此信息,当deviceList参数设置为cudaD3D [9 | 10] GetDevices()或cudaGLGetDevices()时,应用程序通常会选择合适的设备并将Direct3D或OpenGL资源映射到cudaD3D [9 | 10] 11返回的CUDA设备 DeviceListCurrentFrame或cudaGLDeviceListCurrentFrame。
请注意,从cudaGraphicsD9D [9 | 10 | 11] RegisterResource和cudaGraphicsGLRegister [Buffer | Image]只能在设备上使用。 因此,在SLI配置中,在不同的CUDA设备上计算不同帧的数据时,需要分别注册资源.
有关CUDA运行时如何分别与Direct3D和OpenGL互操作的详细信息,请参见Direct3D互操作性和OpenGL互操作性。;
timg

相关实践学习
基于阿里云DeepGPU实例,用AI画唯美国风少女
本实验基于阿里云DeepGPU实例,使用aiacctorch加速stable-diffusion-webui,用AI画唯美国风少女,可提升性能至高至原性能的2.6倍。
目录
相关文章
|
并行计算 异构计算
|
并行计算 异构计算 数据管理
|
并行计算 调度
|
并行计算 异构计算 API
|
并行计算 API
|
缓存 并行计算 编译器
|
并行计算 编译器 C语言
|
并行计算 算法
CUDA学习(二十七)
终于结束了CUDA关于纹理内存的介绍,接下来的应该会GPU对原程序的性能优化有关
1521 0