CUDA学习(五十二)

简介:

例子:
每个线程的分配:

#include <stdlib.h>
#include <stdio.h>
// Per-thread allocation demo: every thread allocates its own 123-byte buffer
// from the device heap, zeroes it, prints the address it received, and
// releases it again.
// Launch layout: any 1D block; only threadIdx.x is used, for the message.
__global__ void mallocTest()
{
    size_t size = 123;
    char* ptr = (char*)malloc(size);
    // Device-side malloc() returns NULL when the device heap cannot satisfy
    // the request; writing through that pointer would be an illegal access.
    if (ptr == NULL) {
        printf("Thread %d: malloc failed\n", threadIdx.x);
        return;
    }
    memset(ptr, 0, size);
    printf("Thread %d got pointer: %p\n", threadIdx.x, ptr);
    free(ptr);
}
// Host entry point: sizes the device heap, runs mallocTest on one block of
// five threads, and waits for it to finish.
// Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
int main()
{
    // Set a heap size of 128 megabytes. Note that this must
    // be done before any kernel is launched.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (err == cudaSuccess) {
        mallocTest<<<1, 5>>>();
        // A kernel launch returns no status; launch errors are reported here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernel executes surface at the next
        // synchronizing call. This also flushes the device-side printf output.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

结果:

Thread 0 got pointer: 00057020
Thread 1 got pointer: 0005708c
Thread 2 got pointer: 000570f8
Thread 3 got pointer: 00057164
Thread 4 got pointer: 000571d0

注意每个线程都会各自执行malloc()和memset(),从而获得并初始化属于自己的那块内存。(实际的指针值会有所不同,这里给出的仅作示例。)
按线程块分配:

#include <stdlib.h>
#include <stdio.h>
// Per-block allocation demo: thread 0 of each block allocates one buffer
// from the device heap, publishes the pointer through shared memory, every
// thread fills its share of the buffer, and thread 0 frees it.
// Launch layout: 1D blocks of any size; uses one shared pointer per block.
__global__ void mallocTest()
{
    // 64 bytes are reserved per thread, so each thread may only write as
    // many ints as fit in that share. Writing 64 ints per thread would run
    // past the end of the allocation.
    const size_t bytesPerThread = 64;
    const int intsPerThread = (int)(bytesPerThread / sizeof(int));

    __shared__ int* data;
    // The first thread in the block does the allocation and then
    // shares the pointer with all other threads through shared memory,
    // so that access can easily be coalesced.
    if (threadIdx.x == 0) {
        size_t size = blockDim.x * bytesPerThread;
        data = (int*)malloc(size);
    }
    // Make the pointer written by thread 0 visible to the whole block.
    __syncthreads();
    // Check for failure. Every thread reads the same shared value, so the
    // whole block leaves together and no thread is left waiting at the
    // barrier below.
    if (data == NULL)
        return;
    // Threads index into the memory, ensuring coalescence:
    // consecutive threads write consecutive ints in each iteration.
    int* ptr = data;
    for (int i = 0; i < intsPerThread; ++i)
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    // Ensure all threads complete before freeing
    __syncthreads();
    // Only one thread may free the memory!
    if (threadIdx.x == 0)
        free(data);
}
// Host entry point: sizes the device heap, runs mallocTest on 10 blocks of
// 128 threads, waits for completion, then pauses for a key press.
// Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
int main()
{
    // The heap limit has to be set before the first kernel launch.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (err == cudaSuccess) {
        mallocTest<<<10, 128>>>();
        // A kernel launch returns no status; launch errors are reported here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernel executes surface at the next
        // synchronizing call.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    // Wait for a key press on both the success and the failure path.
    getchar();
    return err == cudaSuccess ? 0 : 1;
}

内核启动之间的分配持久性:

#include <stdlib.h>
#include <stdio.h>
// Number of thread blocks used by every launch in this example; it is also
// the number of slots in dataptr.
#define NUM_BLOCKS 20
// Device-global table with one pointer slot per block. The kernels below
// index it with blockIdx.x, which is how an allocation made in one kernel
// launch is found again by later launches.
__device__ int* dataptr[NUM_BLOCKS]; // Per-block pointer
// Allocates one int per thread for each block from the device heap, stores
// the pointer in dataptr[blockIdx.x], and zeroes the buffer.
// Launch layout: 1D grid of at most NUM_BLOCKS blocks, 1D blocks.
__global__ void allocmem()
{
    // dataptr has only NUM_BLOCKS slots, so a block beyond that has nowhere
    // to store its pointer. The test is uniform across the block, which
    // makes returning before the barrier safe.
    if (blockIdx.x >= NUM_BLOCKS)
        return;
    // Only the first thread in the block does the allocation
    // since we want only one allocation per block.
    if (threadIdx.x == 0)
        dataptr[blockIdx.x] = (int*)malloc(blockDim.x * sizeof(int));
    // Make the pointer stored by thread 0 visible to the rest of the block.
    __syncthreads();
    // Check for failure
    if (dataptr[blockIdx.x] == NULL)
        return;
    // Zero the data with all threads in parallel
    dataptr[blockIdx.x][threadIdx.x] = 0;
}
// Simple example: each thread adds its own index to the element it owns in
// the buffer recorded for this block. Blocks whose slot is NULL do nothing.
__global__ void usemem()
{
    int* const blockBuf = dataptr[blockIdx.x];
    // Nothing to update when no buffer is recorded for this block.
    if (blockBuf == NULL)
        return;
    blockBuf[threadIdx.x] += threadIdx.x;
}
// Print the content of the buffer before freeing it.
// Each thread prints the element it owns; afterwards thread 0 releases the
// block's buffer and clears the block's slot in dataptr.
// Launch layout: same grid and block shape that was used to allocate.
__global__ void freemem()
{
    int* ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
        printf("Block %d, Thread %d: final value = %d\n",
            blockIdx.x, threadIdx.x, ptr[threadIdx.x]);
    // Every thread must have finished reading its element before the buffer
    // is released; without this barrier thread 0 could free memory that
    // other threads of the block are still reading.
    __syncthreads();
    // Only free from one thread!
    if (threadIdx.x == 0) {
        free(ptr);
        // Clear the slot so that a later launch sees NULL instead of a
        // pointer to memory that has already been freed.
        dataptr[blockIdx.x] = NULL;
    }
}
// Host entry point: sizes the device heap, then runs the allocate / use /
// free kernels one after another to show that a device-heap allocation
// survives across kernel launches.
// Returns 0 on success, 1 if any CUDA call or kernel failed.
int main()
{
    // The heap limit has to be set before the first kernel launch.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);
    if (err == cudaSuccess) {
        // All launches go to the default stream, so they run in issue order.
        // Allocate memory
        allocmem<<<NUM_BLOCKS, 10>>>();
        // Use memory
        usemem<<<NUM_BLOCKS, 10>>>();
        usemem<<<NUM_BLOCKS, 10>>>();
        usemem<<<NUM_BLOCKS, 10>>>();
        // Free memory
        freemem<<<NUM_BLOCKS, 10>>>();
        // Kernel launches return no status; a launch error from any of the
        // launches above is reported here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Errors raised while the kernels execute surface at the next
        // synchronizing call. This also flushes the device-side printf output.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

timg

目录
相关文章
|
并行计算 索引
|
并行计算 API 调度
CUDA学习(八十八)
3.虽然__syncthreads()一直被文档描述为同步线程块中的所有线程,但Pascal及更早的体系结构只能在warp级别强制执行同步。在某些情况下,只要每个warp中至少有一个线程到达屏障,即使并非每个线程都执行了该屏障,屏障也能成功通过。
1738 0
|
并行计算 C语言 编译器
|
并行计算 API 编译器
CUDA学习(六十五)
很早之前就发现云栖社区的编辑器有一个Bug,往草稿箱存博客,当草稿箱博客数超过十篇时,无法再选择十篇前的博客进行编辑
2398 0
|
并行计算 算法
|
并行计算 算法 异构计算
|
并行计算 API
|
并行计算 API
|
存储 并行计算 API