CUDA学习（五十二）-阿里云开发者社区

CUDA学习（五十二）

2018-02-19 1361

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介：

例子：
每个线程的分配：

#include <stdlib.h>
#include <stdio.h>
// Per-thread allocation example.
// Every thread allocates its own 123-byte buffer with device-side malloc(),
// zeroes it, prints the pointer it received, and frees it again.
// Uses only threadIdx.x, so it is written for a 1-D thread block.
__global__ void mallocTest()
{
    size_t size = 123;
    char* ptr = (char*)malloc(size);
    // Device-side malloc() returns NULL when the request cannot be satisfied;
    // writing through that pointer would fault the whole kernel.
    if (ptr == NULL) {
        printf("Thread %d: allocation of %llu bytes failed\n",
            threadIdx.x, (unsigned long long)size);
        return;
    }
    memset(ptr, 0, size);
    printf("Thread %d got pointer: %p\n", threadIdx.x, ptr);
    free(ptr);
}
// Host entry point: sizes the device malloc heap, launches mallocTest with
// one block of five threads, and waits for it to finish.
// Returns 0 on success and EXIT_FAILURE if any CUDA call reports an error.
int main()
{
    // Set a heap size of 128 megabytes. Note that this must
    // be done before any kernel is launched.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (err == cudaSuccess) {
        mallocTest<<<1, 5>>>();
        // A kernel launch returns nothing; launch errors are read back here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the sync call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    return 0;
}

结果：

Thread 0 got pointer: 00057020
Thread 1 got pointer: 0005708c
Thread 2 got pointer: 000570f8
Thread 3 got pointer: 00057164
Thread 4 got pointer: 000571d0

注意每个线程都会各自执行 malloc() 和 memset() 调用，从而获得并初始化属于自己的一块内存。（实际的指针值会有所不同，此处的输出仅作示例。）
按线程块分配：

#include <stdio.h>
#include <stdlib.h>
// Per-block allocation example.
// Thread 0 of each block allocates one buffer of 64 bytes per thread and
// publishes the pointer through shared memory; every thread then fills its
// own interleaved slots with its thread index, and thread 0 frees the buffer.
// Uses only threadIdx.x / blockDim.x, so it is written for a 1-D thread block.
__global__ void mallocTest()
{
    // Bytes reserved per thread, and how many ints fit into that budget.
    // The write loop must stay within this count or it overruns the buffer.
    const size_t bytesPerThread = 64;
    const int intsPerThread = (int)(bytesPerThread / sizeof(int));

    __shared__ int* data;
    // The first thread in the block does the allocation and then
    // shares the pointer with all other threads through shared memory,
    // so that access can easily be coalesced.
    if (threadIdx.x == 0) {
        size_t size = blockDim.x * bytesPerThread;
        data = (int*)malloc(size);
    }
    // Make the pointer written by thread 0 visible to the whole block.
    __syncthreads();
    // Check for failure. `data` is shared, so every thread of the block
    // takes the same branch and no thread is left waiting at a barrier.
    if (data == NULL)
        return;
    // Threads index into the memory, ensuring coalescence: for a fixed i,
    // consecutive threads touch consecutive ints.
    int* ptr = data;
    for (int i = 0; i < intsPerThread; ++i)
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    // Ensure all threads complete before freeing
    __syncthreads();
    // Only one thread may free the memory!
    if (threadIdx.x == 0)
        free(data);
}
// Host entry point: sizes the device malloc heap, launches mallocTest with
// 10 blocks of 128 threads, waits for completion, then waits for a key press.
// Returns 0 on success and EXIT_FAILURE if any CUDA call reports an error.
int main()
{
    int status = 0;
    // The heap limit must be set before any kernel is launched.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (err == cudaSuccess) {
        mallocTest<<<10, 128>>>();
        // A kernel launch returns nothing; launch errors are read back here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the sync call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = EXIT_FAILURE;
    }
    // Wait for input before returning, as the original program does.
    getchar();
    return status;
}

内核启动之间的分配持久性：

#include <stdlib.h>
#include <stdio.h>
#define NUM_BLOCKS 20
__device__ int* dataptr[NUM_BLOCKS]; // Per-block pointer
// Allocates one int buffer per block (blockDim.x elements) on the device
// heap, records it in dataptr[blockIdx.x], and zeroes it with all threads.
// The buffer is deliberately not freed here; it is left in dataptr for
// later kernel launches to use.
// Uses only threadIdx.x / blockDim.x, so it is written for a 1-D thread block.
__global__ void allocmem()
{
    // dataptr has only NUM_BLOCKS slots. blockIdx.x is the same for every
    // thread of a block, so the whole block leaves together here and no
    // thread is stranded at the barrier below.
    if (blockIdx.x >= NUM_BLOCKS)
        return;
    // Only the first thread in the block does the allocation
    // since we want only one allocation per block.
    if (threadIdx.x == 0)
        dataptr[blockIdx.x] = (int*)malloc(blockDim.x * sizeof(int));
    // Make the pointer stored by thread 0 visible to the rest of the block.
    __syncthreads();
    // Check for failure
    if (dataptr[blockIdx.x] == NULL)
        return;
    // Zero the data with all threads in parallel
    dataptr[blockIdx.x][threadIdx.x] = 0;
}
// Simple example: each thread adds its own thread index to the element
// it owns in the buffer recorded for this block.
__global__ void usemem()
{
    int* const blockBuf = dataptr[blockIdx.x];
    // A NULL slot means there is no buffer for this block; nothing to do.
    if (blockBuf == NULL)
        return;
    blockBuf[threadIdx.x] = blockBuf[threadIdx.x] + threadIdx.x;
}
// Prints the content of this block's buffer, then releases it.
// Every thread prints the element at its own thread index; afterwards
// thread 0 frees the buffer and clears the block's slot in dataptr.
__global__ void freemem()
{
    int* ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
        printf("Block %d, Thread %d: final value = %d\n",
            blockIdx.x, threadIdx.x, ptr[threadIdx.x]);
    // Wait until every thread of the block has finished reading the buffer;
    // without this barrier thread 0 could free it while others still read.
    // `ptr` is the same for the whole block, so all threads reach this point.
    __syncthreads();
    // Only free from one thread!
    if (threadIdx.x == 0) {
        free(ptr);
        // Do not leave a dangling pointer behind for later launches.
        dataptr[blockIdx.x] = NULL;
    }
}
// Host entry point demonstrating that device-heap allocations persist
// across kernel launches: allocmem allocates, usemem runs three times on
// the same buffers, and freemem prints and releases them.
// Returns 0 on success and EXIT_FAILURE if any CUDA call reports an error.
int main()
{
    // The heap limit must be set before any kernel is launched.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    // Allocate memory
    if (err == cudaSuccess) {
        allocmem<<<NUM_BLOCKS, 10>>>();
        // A kernel launch returns nothing; launch errors are read back here.
        err = cudaGetLastError();
    }
    // Use memory (three passes over the same buffers)
    for (int pass = 0; pass < 3 && err == cudaSuccess; ++pass) {
        usemem<<<NUM_BLOCKS, 10>>>();
        err = cudaGetLastError();
    }
    // Free memory
    if (err == cudaSuccess) {
        freemem<<<NUM_BLOCKS, 10>>>();
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernels run are reported by the sync call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    return 0;
}

timg

CUDA学习（五十二）

热门文章

最新文章

相关课程

相关电子书