CUDA Coding Practice

  • Compiling a program for CUDA

    • For example, to compile MyProg.cu you would use a command like

    • nvcc -o MyProg MyProg.cu
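
    • Optionally, you can target a specific GPU architecture with the -arch flag (illustrative; pick the value matching your GPU), e.g.

    • nvcc -arch=sm_70 -o MyProg MyProg.cu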

Start
Code Start with GPU
// Note: in this example we do not check
// the error codes or the status of the kernel launch.

#include <cstdio>
#include <cmath>

__global__ void hello()
{
  printf("Greetings from your GPU\n");
}


int main(void)
{
  int count, device;
  cudaGetDeviceCount(&count);
  cudaGetDevice(&device);
  printf("You have in total %d GPUs in your system\n", count);
  printf("GPU %d will now print a message for you:\n", device);

  hello<<<1,1>>>();
  cudaDeviceSynchronize();
  
  return 0;	
}
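
Since the comment above notes that error codes are not checked, here is a minimal sketch (an addition, not part of the original example) of how the launch status could be checked with the CUDA runtime API:

  hello<<<1,1>>>();
  cudaError_t err = cudaGetLastError();   // errors from the launch itself
  if (err != cudaSuccess)
      printf("Launch failed: %s\n", cudaGetErrorString(err));
  err = cudaDeviceSynchronize();          // errors during kernel execution
  if (err != cudaSuccess)
      printf("Kernel failed: %s\n", cudaGetErrorString(err));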
Array Addition Examples in CPU/GPU
  • Array Addition Examples in CPU/GPU

Code Array Addition CPU
//
// g++  01_array_addition_cpu.cpp
// nvcc 01_array_addition_cpu.cpp
//
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

const double EPSILON = 1.0E-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX);
void array_check(const double *vecC, const int NX);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    const int NX = 1000000;
	int size_array = sizeof(double) * NX;
    double *vecA = (double *)malloc(size_array);
    double *vecB = (double *)malloc(size_array);
    double *vecC = (double *)malloc(size_array);

    for (int i = 0; i < NX; i++)
    {
        vecA[i] = a;
        vecB[i] = b;
    }

    array_addition(vecA, vecB, vecC, NX);
    array_check(vecC, NX);

    free(vecA);
    free(vecB);
    free(vecC);

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    for (int i = 0; i < NX; i++)
        vecC[i] = vecA[i] + vecB[i];
}


void array_check(const double *vecC, const int NX)
{
    bool has_error = false;
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
		{
            has_error = true;
			break;
		}
    }
    printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
Code Array Addition GPU
//
// nvcc 02_array_addition_gpu.cu
//
#include <math.h>
#include <stdio.h>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void __global__ array_addition(const double *vecA, const double *vecB, double *vecC, const int NX);
void array_check(const double *vecC, const int NX);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    const int NX = 25600004; // try 25600000, 25600004 and 25600008
	int size_array = sizeof(double) * NX;

    double *h_vecA = (double *)malloc(size_array); // 'h' for host (CPU)
    double *h_vecB = (double *)malloc(size_array);
    double *h_vecC = (double *)malloc(size_array);

    for (int i = 0; i < NX; i++)
    {
        h_vecA[i] = a;
        h_vecB[i] = b;
    }

    double *d_vecA, *d_vecB, *d_vecC; // 'd' for device (GPU)
    cudaMalloc((void **)&d_vecA, size_array);
    cudaMalloc((void **)&d_vecB, size_array);
    cudaMalloc((void **)&d_vecC, size_array);
    cudaMemcpy(d_vecA, h_vecA, size_array, cudaMemcpyHostToDevice); // copy data from host(CPU) to device(GPU)
    cudaMemcpy(d_vecB, h_vecB, size_array, cudaMemcpyHostToDevice);

    const int block_size = 128;
    int grid_size = (NX + block_size - 1) / block_size;

	printf("\n\tArray_size = %d, grid_size = %d and block_size = %d.\n", NX, grid_size, block_size);
    array_addition<<<grid_size, block_size>>>(d_vecA, d_vecB, d_vecC, NX);

    cudaMemcpy(h_vecC, d_vecC, size_array, cudaMemcpyDeviceToHost); // copy data from device(GPU) to host(CPU)
    array_check(h_vecC, NX);

    free(h_vecA);
    free(h_vecB);
    free(h_vecC);
    cudaFree(d_vecA);
    cudaFree(d_vecB);
    cudaFree(d_vecC);

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void __global__ array_addition(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < NX) // grid_size is rounded up, so the last block may contain threads past the end of the arrays
        vecC[i] = vecA[i] + vecB[i];
}


void array_check(const double *vecC, const int NX)
{
    bool has_error = false;
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
		{
            has_error = true;
			break;
		}
    }
    printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
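
Note that with NX values that are not multiples of block_size (e.g. 25600004), grid_size is rounded up and the last block contains threads past the end of the arrays; the i < NX guard in the kernel keeps those threads from reading or writing out of bounds.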
Profiling Performance

ADD SOME RESULTS
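
In the meantime, kernel-level timings can be collected with the standard CUDA profilers, for example:

// nvprof ./a.out                      (legacy profiler, older GPUs)
// nsys profile --stats=true ./a.out   (Nsight Systems)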

Array Reduce Examples in CPU/GPU
  • Array Reduce Examples in CPU/GPU

A reduction collapses an array to a single value. On a GPU, the data is copied to device memory and each thread adds one element to the accumulated value. Thread-safe atomic operations have to be used to ensure that there are no race conditions. Since many threads run simultaneously on a GPU, no loop over the indices is needed.
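
The reduce listings below include a header error_checker.cuh that is not reproduced in these notes. A minimal sketch of such a header, assuming the usual CHECK-macro pattern (the actual file may differ), is:

// error_checker.cuh (sketch)
#pragma once
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                                  \
do {                                                                 \
    const cudaError_t error_code = (call);                           \
    if (error_code != cudaSuccess)                                   \
    {                                                                \
        printf("CUDA Error:\n  File: %s\n  Line: %d\n  Error: %s\n", \
               __FILE__, __LINE__, cudaGetErrorString(error_code));  \
        exit(1);                                                     \
    }                                                                \
} while (0)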

Code Array Reduce CPU
//
// nvcc -O3 -DUSE_DP 01_array_reduce_cpu.cu 
// nvcc -O3          01_array_reduce_cpu.cu
// 
#include "error_checker.cuh"
#include <stdio.h>

#ifdef USE_DP
    typedef double real; // for double precision
#else
    typedef float real;  // for single precision
#endif

const int NUM_REPEATS = 20;

void timing(const real *x, const int NX);
real reduce(const real *x, const int NX);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    const int NX = 100000000;
    const int size_array = sizeof(real) * NX;
    real *x = (real *) malloc(size_array);
    for (int n = 0; n < NX; ++n)
        x[n] = 1.23;

    timing(x, NX);

    free(x);

    printf("\n--Ending of the main function.\n\n");
    return 0;
}


void timing(const real *x, const int NX)
{
    real sum = 0;
    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(x, NX);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }
    printf("\tSum = %f.\n", sum);
}


real reduce(const real *x, const int NX)
{
    real sum = 0.0;
    for (int n = 0; n < NX; ++n)
        sum += x[n];
    return sum;
}
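
A note on precision: with the default single-precision build, the running sum stops growing once it reaches about 2^25 ≈ 3.4e7, because adding 1.23 to a float of that magnitude no longer changes its value, so the result falls far short of the expected 1.23e8; the double-precision build (-DUSE_DP) does not suffer from this.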
Code Array Reduce GPU
//
// nvcc -O3 -DUSE_DP 02_array_reduce_gpu.cu
// nvcc -O3          02_array_reduce_gpu.cu
//
#include "error_checker.cuh"
#include <stdio.h>

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;

void timing(real *h_x, real *d_x, const int method);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    real *h_x = (real *)malloc(size_array);
    for (int n = 0; n < NX; ++n)
        h_x[n] = 1.23;
    real *d_x;
    CHECK(cudaMalloc(&d_x, size_array));

    printf("\n\tUsing global memory only:\n");
    timing(h_x, d_x, 0);
    printf("\n\tUsing static shared memory:\n");
    timing(h_x, d_x, 1);
    printf("\n\tUsing dynamic shared memory:\n");
    timing(h_x, d_x, 2);

    free(h_x);
    CHECK(cudaFree(d_x));

    printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ reduction_global(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    real *x = d_x + blockDim.x * blockIdx.x;

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            x[tid] += x[tid + offset];
        __syncthreads();
    }
    if (tid == 0)
        d_y[blockIdx.x] = x[0];
}


void __global__ reduction_static_shared(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[128];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {

        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    if (tid == 0)
        d_y[bid] = s_y[0];
}

void __global__ reduction_dynamic_shared(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    if (tid == 0)
        d_y[bid] = s_y[0];
}


real reduction(real *d_x, const int method)
{
    int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int ymem = sizeof(real) * grid_size;
    const int smem = sizeof(real) * BLOCK_SIZE;
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));
    real *h_y = (real *) malloc(ymem);

    switch (method)
    {
        case 0:
            reduction_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y); break;
        case 1:
            reduction_static_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y); break;
        case 2:
            reduction_dynamic_shared<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y); break;
        default:
            printf("Error: wrong method\n");
            exit(1);
            break;
    }

    CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

    real result = 0.0;
    for (int n = 0; n < grid_size; ++n)
        result += h_y[n];

    free(h_y);
    CHECK(cudaFree(d_y));
    return result;
}


void timing(real *h_x, real *d_x, const int method)
{
    real sum = 0;
    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));

        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduction(d_x, method);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("\tSum = %f.\n", sum);
}
Code Array Reduce Atomic GPU
//
// nvcc -O3 -DUSE_DP 03_array_reduce_gpu_atomic.cu
// nvcc -O3          03_array_reduce_gpu_atomic.cu
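// (note: atomicAdd on double requires compute capability 6.0 or higher,
//  so add e.g. -arch=sm_60 when compiling with -DUSE_DP)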
//
#include "error_checker.cuh"
#include <stdio.h>

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;

void timing(const real *d_x);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    real *h_x = (real *) malloc(size_array);
    for (int n = 0; n < NX; ++n)
        h_x[n] = 1.23;
    real *d_x;
    CHECK(cudaMalloc(&d_x, size_array));
    CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));

    printf("\n\tUsing atomicAdd:\n");
    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));

    printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ reduce(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    if (tid == 0)
        atomicAdd(d_y, s_y[0]);
}

real reduce(const real *d_x)
{
    const int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, sizeof(real)));
    CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));

    reduce<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX);

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x); 

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("\tSum = %f.\n", sum);
}

Code Array Reduce Shuffle GPU
//
// nvcc -O3 -DUSE_DP 04_array_reduce_gpu_shuffle.cu
// nvcc -O3          04_array_reduce_gpu_shuffle.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const unsigned FULL_MASK = 0xffffffff;

void timing(const real *d_x, const int method);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    real *h_x = (real *) malloc(size_array);
    for (int n = 0; n < NX; ++n)
        h_x[n] = 1.23;
    real *d_x;
    CHECK(cudaMalloc(&d_x, size_array));
    CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));

    printf("\n\tUsing syncwarp:\n");
    timing(d_x, 0);
    printf("\n\tUsing shuffle:\n");
    timing(d_x, 1);
    printf("\n\tUsing cooperative group:\n");
    timing(d_x, 2);

    free(h_x);
    CHECK(cudaFree(d_x));

    printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ reduce_syncwarp(const real *d_x, real *d_y, const int NX)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    for (int offset = 16; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncwarp();
    }

    if (tid == 0)
        atomicAdd(d_y, s_y[0]);
}

void __global__ reduce_shfl(const real *d_x, real *d_y, const int NX)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    real y = s_y[tid];

    for (int offset = 16; offset > 0; offset >>= 1)
        y += __shfl_down_sync(FULL_MASK, y, offset);

    if (tid == 0)
        atomicAdd(d_y, y);
}

void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < NX) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    real y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
        y += g.shfl_down(y, i);

    if (tid == 0)
        atomicAdd(d_y, y);
}

real reduce(const real *d_x, const int method)
{
    const int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, sizeof(real)));
    CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));

    switch (method)
    {
        case 0:
            reduce_syncwarp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
        case 1:
            reduce_shfl<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
        case 2:
            reduce_cp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
        default:
            printf("\tWrong method.\n"); exit(1);
    }

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

void timing(const real *d_x, const int method)
{
    real sum = 0;
    
    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x, method); 

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("\tSum = %f.\n", sum);
}
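
The __shfl_down_sync intrinsic (available since CUDA 9) exchanges data directly between the registers of threads within a warp, so the final 16-to-1 steps need neither shared memory nor __syncthreads(); FULL_MASK = 0xffffffff means that all 32 lanes of the warp participate.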
Code Array Reduce Parallelism GPU
//
// nvcc -O3 -DUSE_DP 05_array_reduce_gpu_parallelism.cu
// nvcc -O3          05_array_reduce_gpu_parallelism.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;

void timing(const real *h_x);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    real *h_x = (real *) malloc(size_array);
    for (int n = 0; n < NX; ++n)
        h_x[n] = 1.23;
    real *d_x;
    CHECK(cudaMalloc(&d_x, size_array));
    CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));

    printf("\n--Ending of the main function.\n\n");
    return 0;
}

void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < NX; n += stride)
        y += d_x[n];
    s_y[tid] = y;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
        y += g.shfl_down(y, i);

    if (tid == 0)
        d_y[bid] = y;
}

real reduce(const real *d_x)
{
    const int ymem = sizeof(real) * GRID_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));

    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, NX);
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

void timing(const real *d_x)
{
    real sum = 0;

    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x); 

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    printf("\tSum = %f.\n", sum);
}
Code Array Reduce Static GPU
//
// nvcc -O3 -DUSE_DP 06_array_reduce_gpu_static.cu
// nvcc -O3          06_array_reduce_gpu_static.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;

void timing(const real *d_x);

int main(int argc, const char * argv[])
{

    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);

    real *h_x = (real *) malloc(size_array);
    for (int n = 0; n < NX; ++n)
        h_x[n] = 1.23;
    real *d_x;
    CHECK(cudaMalloc(&d_x, size_array));
    CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));

    printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < NX; n += stride)
        y += d_x[n];
    s_y[tid] = y;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }

    y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
        y += g.shfl_down(y, i);

    if (tid == 0)
        d_y[bid] = y;
}

__device__ real static_y[GRID_SIZE]; 

real reduce(const real *d_x)
{
    real *d_y;
    CHECK(cudaGetSymbolAddress((void**)&d_y, static_y));
    
    const int smem = sizeof(real) * BLOCK_SIZE;

    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, NX);
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);

    real h_y[1] = {0};
    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    // CHECK(cudaMemcpyFromSymbol(h_y, static_y, sizeof(real))); // also ok

    return h_y[0];
}

void timing(const real *d_x)
{
    real sum = 0;
    for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        sum = reduce(d_x); 

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("\tTime = %g ms.\n", elapsed_time);

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }
    printf("\tSum = %f.\n", sum);
}
Profiling Performance

ADD SOME RESULTS

Matrix Summation Examples in GPU
  • Matrix Summation Examples in GPU
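
As a worked example of the index mapping used throughout this section (idx = iy * nx + ix): for a matrix with nx = 8 columns, block(4, 2), the thread with threadIdx = (1, 1) in block blockIdx = (1, 2) gets ix = 1 + 1*4 = 5, iy = 1 + 2*2 = 5, and hence idx = 5*8 + 5 = 45.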

Code GPU Grid Info
//
// nvcc 01_GPU_grid_block_thread_info.cu
//
#include <cuda_runtime.h>
#include <stdio.h>
#include "error_checker.cuh"

void __global__ check_grid_block_thread_info_GPU(void);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n\n");

	printf("\t***************************************************\n");
	printf("\t********** Output for num_element = 1024 **********\n");
	printf("\t***************************************************\n\n");

    int num_elements = 1024;
	printf("\t\tThere are %d data, which can be distributed:\n", num_elements);

    // define grid and block structure
    dim3 block(1024); // == (block.x = 1024; block.y = 1; block.z = 1;)
    dim3 grid((num_elements + block.x - 1) / block.x);
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 512;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 256;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 128;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n\n", grid.x, block.x);

    CHECK(cudaDeviceSynchronize());


	printf("\t***************************************************\n");
	printf("\t*********** Output for num_element = 16 ***********\n");
	printf("\t***************************************************\n\n");

    // reset the total number of data elements
    num_elements = 16;

    // reset grid and block structure
    block.x = 2;
	grid.x = (num_elements + block.x - 1) / block.x;

    // check grid and block info from host side
    printf("\t\t- CPU output -- grid.x=%d,  grid.y=%d,  grid.z=%d\n",  grid.x,  grid.y,  grid.z);
    printf("\t\t- CPU output -- block.x=%d, block.y=%d, block.z=%d\n", block.x, block.y, block.z);
	putchar('\n');

    check_grid_block_thread_info_GPU<<<grid, block>>>();

    CHECK(cudaDeviceReset());

	printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ check_grid_block_thread_info_GPU(void)
{
    printf("\t\t- GPU output -- gridDim=(%d, %d, %d)   blockDim=(%d, %d, %d)  blockIdx=(%d, %d, %d)  threadIdx=(%d, %d, %d)\n", 
		gridDim.x,   gridDim.y,   gridDim.z,  blockDim.x,  blockDim.y,  blockDim.z,	
		blockIdx.x,  blockIdx.y,  blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
}
Code Matrix Thread Index Info
//
// nvcc 02_matrix_thread_index_info.cu
//
#include <cstdio>
#include <cuda_runtime.h>
#include "error_checker.cuh"

void initialInt(int *matrix, int nxy);
void printMatrix(int *h_matrixA, const int nx, const int ny);
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name); // Using Device 0: NVIDIA GeForce GT 1030

    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
	int size_matrix = nxy*(sizeof(int));

    // malloc host mem
    int *h_matrixA;
    h_matrixA = (int *)malloc(size_matrix);
    initialInt(h_matrixA, nxy);
    printMatrix(h_matrixA, nx, ny);

    //malloc device mem
    int *d_matrixA;
    cudaMalloc((void **)&d_matrixA, size_matrix); // 
    cudaMemcpy(d_matrixA, h_matrixA, size_matrix, cudaMemcpyHostToDevice); // copy data from CPU to GPU

    // set up execution configuration
    dim3 block(4, 2);
    dim3 grid((nx + block.x-1)/block.x, (ny + block.y-1)/block.y);
	printf("\ngrid info  >>> grid.x=%d   grid.y=%d   grid.z=%d.\n", grid.x, grid.y, grid.z);
	printf("block info >>> block.x=%d  block.y=%d  block.z=%d.\n\n", block.x, block.y, block.z);

    //invoke kernel
    printGPUIdx<<<grid, block>>>(d_matrixA, nx, ny);
    cudaDeviceSynchronize();
	printf("\n");

    //free host and device
    free(h_matrixA);
    cudaFree(d_matrixA);

    //reset device
    CHECK(cudaDeviceReset());

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void initialInt(int *matrix, int nxy){
    for(int i = 0; i < nxy; i++)
        matrix[i] = i;
}


void printMatrix(int *h_matrixA, const int nx, const int ny){
    int *ic = h_matrixA;
    printf("\nMatrix:%d, %d\n", nx, ny);
    for(int iy = 0; iy < ny; iy++){
        for(int ix=0; ix < nx; ix++)
            printf("%3d ",ic[ix]);
        ic += nx;
        printf("\n");
    }
}


void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny){
    int ix = threadIdx.x + blockIdx.x*blockDim.x;
    int iy = threadIdx.y + blockIdx.y*blockDim.y;
    unsigned int idx = iy*nx + ix;
    printf("block_id (%d %d) thread_id (%d,%d) coordinate (%d %d) global_index (%d) value (%d)\n",
		blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, ix, iy, idx, d_matrixA[idx]);
}
Code Matrix Summation
// 
// nvcc 03_matrix_summation_GPU_2D2D_2D1D_1D1D.cu
// 
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h> // for fabs() in the result check
#include "error_checker.cuh"

const double EPSILON = 1.0E-8;

void matrix_initialization(float *ip, const int size);
void matrix_summation_on_CPU(float *A, float *B, float *C, const int, const int);
void check_results_from_CPU_GPU(float *fromCPU, float *fromGPU, const int);
void __global__ matrix_summation_on_GPU_1D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D2D(float *A, float *B, float *C, int, int);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("\n\tUsing Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 10;
    int ny = 1 << 10;

    int nxy = nx * ny;
    int size_matrix = nxy * sizeof(float);
    printf("\n\tMatrix size: nx=%d ny=%d\n", nx, ny);

    // malloc host memory
    float *h_matrixA, *h_matrixB, *h_matrixSumFromCPU, *h_matrixSumFromGPU;
    h_matrixA = (float *)malloc(size_matrix);
    h_matrixB = (float *)malloc(size_matrix);
    h_matrixSumFromCPU = (float *)malloc(size_matrix);
    h_matrixSumFromGPU = (float *)malloc(size_matrix);

    // initialize data at host side and define a timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaEventQuery(start);
    matrix_initialization(h_matrixA, nxy);
    matrix_initialization(h_matrixB, nxy);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float elapsed_time;
    cudaEventElapsedTime(&elapsed_time, start, stop);
    printf("\tMatrix initialization on host(CPU) elapsed %f ms\n", elapsed_time);

    memset(h_matrixSumFromCPU, 0, size_matrix);
    memset(h_matrixSumFromGPU, 0, size_matrix);

    // summation of matrix elements at host(CPU) side
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaEventQuery(start);
    matrix_summation_on_CPU(h_matrixA, h_matrixB, h_matrixSumFromCPU, nx, ny);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    printf("\tMatrix summation on host(CPU) elapsed %f sec\n", elapsed_time);


    // malloc device global memory
    float *d_matrixA, *d_matrixB, *d_matrixC;
    CHECK(cudaMalloc((void **)&d_matrixA, size_matrix));
    CHECK(cudaMalloc((void **)&d_matrixB, size_matrix));
    CHECK(cudaMalloc((void **)&d_matrixC, size_matrix));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_matrixA, h_matrixA, size_matrix, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_matrixB, h_matrixB, size_matrix, cudaMemcpyHostToDevice));

//---------------
    // invoke kernel at host side for summation on GPU using 2D_grid and 2D_block
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy); // (32, 32, 1)
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); //(32, 32, 1)

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaEventQuery(start);
    matrix_summation_on_GPU_2D2D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (2D_grid 2D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    cudaGetLastError(); // check kernel error

    // copy kernel result back to host side
    cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

    // invoke kernel at host side for summation on GPU using 2D_grid and 1D_block
    dimy = 1;
	block.y = dimy; // block (32, 1, 1)
	grid.x = (nx + block.x - 1) / block.x;
	grid.y = ny; // grid (32, 1024, 1)

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaEventQuery(start);
    matrix_summation_on_GPU_2D1D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (2D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    cudaGetLastError(); // check kernel error

    // copy kernel result back to host side
    cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

    // invoke kernel at host side for summation on GPU using 1D_grid and 1D_block
    dimy = 1;
	block.y = dimy; // block (32, 1, 1)
	grid.x = (nx + block.x - 1) / block.x;
	grid.y = 1; // grid (32, 1, 1)

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cudaEventQuery(start);
    matrix_summation_on_GPU_1D1D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (1D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    cudaGetLastError(); // check kernel error

    // copy kernel result back to host side
    cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

	// destroy start and stop events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));	

    // free host memory and device global memory
    free(h_matrixA);
    free(h_matrixB);
    free(h_matrixSumFromCPU);
    free(h_matrixSumFromGPU);
    cudaFree(d_matrixA);
    cudaFree(d_matrixB);
    cudaFree(d_matrixC);

    CHECK(cudaDeviceReset());

	printf("\n--Ending of the main function.\n\n");
    return 0;
}


void matrix_initialization(float *ip, const int size)
{
    for(int i = 0; i < size; i++)
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
}


void matrix_summation_on_CPU(float *matrixA, float *matrixB, float *matrixC,
	const int nx, const int ny)
{
    float *ia = matrixA;
    float *ib = matrixB;
    float *ic = matrixC;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
            ic[ix] = ia[ix] + ib[ix];
        ia += nx;
        ib += nx;
        ic += nx;
    }
}


void __global__ matrix_summation_on_GPU_2D2D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_2D1D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_1D1D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if (ix < nx)
        for (int iy = 0; iy < ny; iy++)
        {
            int idx = iy * nx + ix;
            matrixC[idx] = matrixA[idx] + matrixB[idx];
        }      
}


void check_results_from_CPU_GPU(float *h_matrixSumFromCPU,
	float *h_matrixSumFromGPU, const int N)
{
    bool has_error = false;
    for (int i = 0; i < N; i++)
    {
        if (fabs(h_matrixSumFromCPU[i] - h_matrixSumFromGPU[i]) > EPSILON)
        {
            has_error = true;
            printf("host %f gpu %f\n", h_matrixSumFromCPU[i], h_matrixSumFromGPU[i]);
            break;
        }
    }
    printf("\tChecking matrix summation results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
Profiling Performance

ADD SOME RESULTS

Parallel Reduction Examples in CPU/GPU
  • Parallel reduction examples in CPU/GPU. The GPU versions below follow the classic optimization sequence: GPU1 adds every element with atomicAdd; GPU2 does pairwise sums into ping-pong buffers; GPU3 moves the tree reduction into shared memory (divergent modulo indexing); GPU4 replaces the modulo with strided indexing; GPU5 switches to sequential addressing; GPU6 performs a first addition while loading from global memory; GPU7 additionally unrolls the last warp.

Code Parallel Reduction CPU1
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float result = 0.0;

    // Timing
    clock_t start = clock();

    // Main loop
    for (int i = 0; i < numElements; i++)
    {
        result = result + data[i];
    }

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(data);
    
    return 0;
}
Code Parallel Reduction CPU2
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

void reduce(const float* data, float* result, int numElements)
{
    for (int i = 0; i < numElements/2; i++)
    {
        result[i] = data[2*i] + data[2*i + 1];
    }
    if (numElements % 2 != 0)
    {
        result[0] += data[numElements-1];
    }
}


int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float* result = (float*)calloc(numElements, sizeof(float));

    // Timing
    clock_t start = clock();

    reduce(data, result, numElements);
    // Main loop
    for (int numElementsCurrent = numElements/2; numElementsCurrent > 1; numElementsCurrent /= 2)
    {
        reduce(result, result, numElementsCurrent);
    }

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", result[0]);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(data);
    free(result);
    
    return 0;
}
Code Parallel Reduction GPU1
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        atomicAdd(result, data[i]);
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
    
    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_result;
    cudaMalloc((void**)&d_result, 1*sizeof(float));
    cudaMemset(d_result, 0, sizeof(float)); // zero all bytes of the accumulator, not just one

    // Timing
    clock_t start = clock();

    // Call the reduction kernel
    reduce_kernel<<<numBlocks, threadsPerBlock>>>(d_data, d_result, numElements);

    cudaMemcpy(&h_result, d_result, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result);
    
    return 0;
}
Code Parallel Reduction GPU2
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    int d_i = threadIdx.x + blockIdx.x*blockDim.x;
    
    result[d_i] = getValue(data, 2*d_i, numElements) + getValue(data, 2*d_i + 1, numElements);

    if (d_i == 0 && numElements % 2 != 0)
    {
        result[d_i] += data[numElements-1];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/2/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numElements*sizeof(float));
    cudaMalloc((void**)&d_result2, numElements*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numElements/2; numElementsCurrent > 1; numElementsCurrent = numElementsCurrent/2)
    {
        int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock>>>(d_result1, d_result2, numElementsCurrent);
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
Code Parallel Reduction GPU3
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    extern __shared__ float s_data[];

    int s_i = threadIdx.x;
    int d_i = threadIdx.x + blockIdx.x*blockDim.x;
    s_data[s_i] = getValue(data, d_i, numElements);
    
    for (int offset = 1; offset < blockDim.x; offset *= 2)
    {
        __syncthreads();
        if (s_i % (2*offset) == 0)
        {
            s_data[s_i] += s_data[s_i + offset];
        }
    }

    if (s_i == 0)
    {
        result[blockIdx.x] = s_data[0];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
    cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
    {
        int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
        numElementsCurrent = numBlocksCurrent;
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
Code Parallel Reduction GPU4
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    extern __shared__ float s_data[];

    int s_i = threadIdx.x;
    int d_i = threadIdx.x + blockIdx.x*blockDim.x;
    s_data[s_i] = getValue(data, d_i, numElements);
    
    for (int offset = 1; offset < blockDim.x; offset *= 2)
    {
        __syncthreads();
        int index = 2 * offset * s_i;
        if (index < blockDim.x)
        {
            s_data[index] += s_data[index + offset];
        }
    }

    if (s_i == 0)
    {
        result[blockIdx.x] = s_data[0];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
    cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
    {
        int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
        numElementsCurrent = numBlocksCurrent;
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
Code Parallel Reduction GPU5
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    extern __shared__ float s_data[];

    int s_i = threadIdx.x;
    int d_i = threadIdx.x + blockIdx.x*blockDim.x;
    s_data[s_i] = getValue(data, d_i, numElements);
    
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        __syncthreads();
        if (s_i < offset)
        {
            s_data[s_i] += s_data[s_i + offset];
        }
    }

    if (s_i == 0)
    {
        result[blockIdx.x] = s_data[0];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
    cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
    {
        int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
        numElementsCurrent = numBlocksCurrent;
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
Code Parallel Reduction GPU6
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    extern __shared__ float s_data[];

    int s_i = threadIdx.x;
    int d_i = threadIdx.x + blockIdx.x*2*blockDim.x;
    s_data[s_i] = getValue(data, d_i, numElements) + getValue(data, d_i + blockDim.x, numElements);
    
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        __syncthreads();
        if (s_i < offset)
        {
            s_data[s_i] += s_data[s_i + offset];
        }
    }

    if (s_i == 0)
    {
        result[blockIdx.x] = s_data[0];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/2/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
    cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
    {
        int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
        numElementsCurrent = numBlocksCurrent;
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
Code Parallel Reduction GPU7
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
    if(index < numElements)
    {
        return data[index];
    }
    else
    {
        return 0.0f;
    }
}
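
// Note (added): the unrolled warp reduction below relies on implicit
// warp-synchronous execution through a volatile pointer. On Volta (sm_70)
// and newer GPUs, the threads of a warp are not guaranteed to execute in
// lockstep, so each step should use __syncwarp() (or warp shuffles) instead.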

__device__ void warpReduce(volatile float* sdata, int s_i)
{
    sdata[s_i] += sdata[s_i + 32];
    sdata[s_i] += sdata[s_i + 16];
    sdata[s_i] += sdata[s_i + 8];
    sdata[s_i] += sdata[s_i + 4];
    sdata[s_i] += sdata[s_i + 2];
    sdata[s_i] += sdata[s_i + 1];
}

__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
    extern __shared__ float s_data[];

    int s_i = threadIdx.x;
    int d_i = threadIdx.x + blockIdx.x*2*blockDim.x;
    s_data[s_i] = getValue(data, d_i, numElements) + getValue(data, d_i + blockDim.x, numElements);
        
    for (int offset = blockDim.x / 2; offset > 32; offset >>= 1)
    {
        __syncthreads();
        if (s_i < offset)
        {
            s_data[s_i] += s_data[s_i + offset];
        }
    }

    if (s_i < 32)
    {
        warpReduce(s_data, s_i);
    }

    if (s_i == 0)
    {
        result[blockIdx.x] = s_data[0];
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;

    printf("Reducing over %d values.\n", numElements);

    float* h_data   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    float h_result = 0.0;

    float* d_data;
    cudaMalloc((void**)&d_data, numElements*sizeof(float));
    cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/2/BLOCK_SIZE + 1;

    float* d_result1;
    float* d_result2;
    cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
    cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));

    // Timing
    clock_t start = clock();

    // Main loop
    reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
    for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
    {
        int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
        reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
        numElementsCurrent = numBlocksCurrent;
        std::swap(d_result1, d_result2);
    }

    cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The result is: %f\n", h_result);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data);
    cudaFree(d_data);
    cudaFree(d_result1);
    cudaFree(d_result2);
    
    return 0;
}
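The warpReduce() above relies on implicit warp-synchronous execution, which is no longer guaranteed on Volta and newer GPUs with independent thread scheduling. A safer modern idiom for the final warp uses warp shuffles; the sketch below is an alternative to warpReduce(), not part of the original listing:

__device__ float warpReduceShfl(float value)
{
    // Each step folds the upper half of the active lanes onto the lower
    // half; the mask 0xffffffff makes all 32 lanes of the warp participate
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        value += __shfl_down_sync(0xffffffff, value, offset);
    }
    return value; // lane 0 ends up holding the warp's sum
}

In the kernel, a thread with s_i < 32 would call it as warpReduceShfl(s_data[s_i] + s_data[s_i + 32]) after the shared-memory loop (and the __syncthreads() that follows it), with lane 0 writing the block result.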
Profiling Performance

ADD SOME RESULTS
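A typical way to collect kernel and transfer timings for the reduction listings (assuming Nsight Systems is installed; the binary name is illustrative):

nsys profile --stats=true ./reduction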

Task Parallelism Examples in CPU/GPU
  • Task Parallelism Examples in CPU/GPU

Code Task Parallelism Async Vers1 CPP
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <math.h>
#include <time.h>

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

void func1(const float* in, float* out, int numElements)
{
    for (int i = 0; i < numElements; i++)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = std::sin(value);
        }
        out[i] = value;
    }
}

void func2(const float* in, float* out, int numElements)
{
    for (int i = 0; i < numElements; i++)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = -std::sin(value);
        }
        out[i] = value;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* data1   = (float*)calloc(numElements, sizeof(float));
    float* data2   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        data1[i] = float(rand())/float(RAND_MAX + 1.0);
        data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    // Timing
    clock_t start = clock();

    func1(data1, data1, numElements);
    func2(data2, data2, numElements);

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += data1[i];
        sum2 += data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(data1);
    free(data2);
    
    return 0;
}
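The serial host version above also exposes task parallelism: func1 and func2 touch disjoint arrays, so they can run on separate host threads. A minimal, self-contained sketch using std::async (the helper below is illustrative and not part of the course code):

#include <cmath>
#include <cstdio>
#include <future>
#include <vector>

// Illustrative stand-in for func1/func2: apply +/- sin repeatedly
static void transform(float* data, int numElements, float sign)
{
    for (int i = 0; i < numElements; i++)
    {
        float value = data[i];
        for (int iter = 0; iter < 100; iter++)
        {
            value = sign * std::sin(value);
        }
        data[i] = value;
    }
}

int main()
{
    std::vector<float> data1(1000000, 0.5f);
    std::vector<float> data2(1000000, 0.5f);

    // The two transforms are independent, so launch them concurrently;
    // get() joins the worker threads
    auto job1 = std::async(std::launch::async, transform, data1.data(), (int)data1.size(),  1.0f);
    auto job2 = std::async(std::launch::async, transform, data2.data(), (int)data2.size(), -1.0f);
    job1.get();
    job2.get();

    printf("%f %f\n", data1[0], data2[0]);
    return 0;
}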
Code Task Parallelism Async Vers1 GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = -sinf(value);
        }
        out[i] = value;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1   = (float*)calloc(numElements, sizeof(float));
    float* h_data2   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    // Timing
    clock_t start = clock();

    cudaMemcpy(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice);
    func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
    cudaMemcpy(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost);

    cudaMemcpy(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice);
    func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data2, d_data2, numElements);
    cudaMemcpy(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data1);
    free(h_data2);
    
    cudaFree(d_data1);
    cudaFree(d_data2);

    return 0;
}
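Note that clock() measures host CPU time, which only approximates the GPU timeline. For device-side timing, CUDA events are the usual tool; a minimal sketch that could wrap the copy/kernel sequence above (a fragment, assumed to sit inside main):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
// ... the memcpy / kernel / memcpy sequence to be timed ...
cudaEventRecord(stop);
cudaEventSynchronize(stop); // wait until 'stop' has actually been reached

float milliseconds = 0.0f;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time: %f ms\n", milliseconds);

cudaEventDestroy(start);
cudaEventDestroy(stop);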
Code Task Parallelism Async Vers1 GPU (Streams)
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = -sinf(value);
        }
        out[i] = value;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1;
    float* h_data2;

    // Pinned (page-locked) host memory: required for the cudaMemcpyAsync
    // calls below to actually overlap transfers with kernel execution
    cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
    cudaMallocHost((void**)&h_data2, numElements*sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    cudaStream_t stream1;
    cudaStream_t stream2;

    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    // Timing
    clock_t start = clock();

    cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
    cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);

    cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data2, d_data2, numElements);
    cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaDeviceSynchronize();

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    cudaFreeHost(h_data1);
    cudaFreeHost(h_data2);

    cudaFree(d_data1);
    cudaFree(d_data2);
    
    return 0;
}
Code Task Parallelism Async Vers2 CPP
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <math.h>
#include <time.h>

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

void func1(const float* in, float* out, int numElements)
{
    for (int i = 0; i < numElements; i++)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = std::sin(value);
        }
        out[i] = value;
    }
}

void func2(const float* in1, const float* in2, float* out, int numElements)
{
    for (int i = 0; i < numElements; i++)
    {
        float value1 = in1[numElements - i - 1];
        float value2 = in2[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value2 = -std::sin(value2);
        }
        out[i] = value1 + value2;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* data1   = (float*)calloc(numElements, sizeof(float));
    float* data2   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        data1[i] = float(rand())/float(RAND_MAX + 1.0);
        data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    // Timing
    clock_t start = clock();

    func1(data1, data1, numElements);
    func2(data1, data2, data2, numElements);

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += data1[i];
        sum2 += data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(data1);
    free(data2);
    
    return 0;
}
Code Task Parallelism Async Vers2 GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value1 = in1[numElements - i - 1];
        float value2 = in2[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value2 = -sinf(value2);
        }
        out[i] = value1 + value2;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1   = (float*)calloc(numElements, sizeof(float));
    float* h_data2   = (float*)calloc(numElements, sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    // Timing
    clock_t start = clock();

    cudaMemcpy(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice);
    func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
    cudaMemcpy(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost);

    cudaMemcpy(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice);
    // func2_kernel reads d_data1, produced by func1_kernel above; on the
    // default stream this ordering is automatic
    func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data2, d_data2, numElements);
    cudaMemcpy(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost);

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data1);
    free(h_data2);
    
    cudaFree(d_data1);
    cudaFree(d_data2);

    return 0;
}
Code Task Parallelism Async Vers2 GPU (Streams)
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value1 = in1[numElements - i - 1];
        float value2 = in2[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value2 = -sinf(value2);
        }
        out[i] = value1 + value2;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1;
    float* h_data2;

    cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
    cudaMallocHost((void**)&h_data2, numElements*sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    cudaStream_t stream1;
    cudaStream_t stream2;

    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    // Timing
    clock_t start = clock();

    cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
    cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
    
    cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
    // NOTE: func2_kernel reads d_data1, which func1_kernel produces in
    // stream1, but nothing orders the two streams here, so this launch races
    // with stream1 (Vers3 and Vers4 below add the missing dependency)
    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
    cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaDeviceSynchronize();

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    cudaFreeHost(h_data1);
    cudaFreeHost(h_data2);

    cudaFree(d_data1);
    cudaFree(d_data2);
    
    return 0;
}
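Vers3 below restores correctness with a full cudaDeviceSynchronize(), which blocks the host and both streams. A lighter host-side alternative would be to wait only on the stream that produces d_data1; a sketch of the relevant fragment:

// Wait only for stream1, where func1_kernel produces d_data1 ...
cudaStreamSynchronize(stream1);
// ... then the dependent kernel in stream2 can be launched safely
func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);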
Code Task Parallelism Async Vers3 GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value1 = in1[numElements - i - 1];
        float value2 = in2[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value2 = -sinf(value2);
        }
        out[i] = value1 + value2;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1;
    float* h_data2;

    cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
    cudaMallocHost((void**)&h_data2, numElements*sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    cudaStream_t stream1;
    cudaStream_t stream2;

    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    // Timing
    clock_t start = clock();

    cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);

    // Block the host until all preceding work in both streams has finished,
    // so that func2_kernel below can safely read d_data1 produced in stream1
    cudaDeviceSynchronize();

    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
    cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
    cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaDeviceSynchronize();

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    cudaFreeHost(h_data1);
    cudaFreeHost(h_data2);

    cudaFree(d_data1);
    cudaFree(d_data2);
    
    return 0;
}
Code Task Parallelism Async Vers4 GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

#define BLOCK_SIZE 256

static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;

__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value = in[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value = sinf(value);
        }
        out[i] = value;
    }
}

__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    if (i < numElements)
    {
        float value1 = in1[numElements - i - 1];
        float value2 = in2[i];
        for (int iter = 0; iter < numIterations; iter++)
        {
            value2 = -sinf(value2);
        }
        out[i] = value1 + value2;
    }
}

int main(int argc, char* argv[])
{

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    printf("Transforming %d values.\n", numElements);

    float* h_data1;
    float* h_data2;

    cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
    cudaMallocHost((void**)&h_data2, numElements*sizeof(float));

    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    int numBlocks = numElements/BLOCK_SIZE + 1;

    float* d_data1;
    float* d_data2;

    cudaMalloc((void**)&d_data1, numElements*sizeof(float));
    cudaMalloc((void**)&d_data2, numElements*sizeof(float));

    cudaStream_t stream1;
    cudaStream_t stream2;

    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    cudaEvent_t data1IsReady;
    cudaEventCreate(&data1IsReady);

    // Timing
    clock_t start = clock();

    cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
    // Record an event in stream1 at the point where d_data1 is ready
    cudaEventRecord(data1IsReady, stream1);
    cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
    
    cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
    // Make stream2 wait until the event has completed before launching
    // func2_kernel; the host and the copy in stream1 are not blocked
    cudaStreamWaitEvent(stream2, data1IsReady, 0);
    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
    cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaDeviceSynchronize();

    // Timing
    clock_t finish = clock();

    printf("The results are:\n");
    for (int i = 0; i < numValuesToPrint; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = numElements - numValuesToPrint; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    cudaFreeHost(h_data1);
    cudaFreeHost(h_data2);

    cudaFree(d_data1);
    cudaFree(d_data2);
    
    return 0;
}
Profiling Performance

ADD SOME RESULTS

Vector Examples in CPU/GPU
Profiling Performance

ADD SOME RESULTS

Dynamic Sync
Code Dynamic Sync
#include <cstdio>
#include <cassert>

const int blocksize = 96;

// kernel prototypes
__global__ void k1(void);
__global__ void k2(void);

// array of values to fill
__device__ int data[blocksize];

// kernel that fills the array in device memory
__global__ void k2(void)
{
    int idx = threadIdx.x;
    if (idx < blocksize) {
        data[idx] = idx;
    }
}

// kernel that calls the fill kernel
__global__ void k1(void)
{
    int idx = threadIdx.x;

    if (idx == 0) {
        k2<<<1, blocksize>>>();
        // Note: device-side cudaDeviceSynchronize() is deprecated since
        // CUDA 11.6 and removed in CUDA 12; newer toolkits order work after
        // a child kernel with tail launch (cudaStreamTailLaunch) instead
        cudaDeviceSynchronize();
    }
    __syncthreads();

    printf("Thread %i has value %i\n", idx, data[idx]);
}

int main(void)
{
    k1<<<1, blocksize>>>();
    cudaDeviceSynchronize();
    return 0;
}
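Kernels that launch other kernels (dynamic parallelism) must be compiled with relocatable device code, for example (architecture flag and file name are illustrative; older toolkits may also need -lcudadevrt):

nvcc -rdc=true -arch=sm_70 -o dynamic_sync dynamic_sync.cu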
Simple Streams
Code Simple Stream
#include <cstdio>
#include <cmath>
#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG

// Number of iterations in the kernel
#define ITER_MULTIPLIER 4

// Number of tests, number of streams doubles for each test. That is,
// there will be 2^N_TESTS streams in the last test
#define N_TESTS 4

// Information of stream for simple domain decomposition
struct stream {
    cudaStream_t strm;   // Stream
    int len;             // Length of the part for this stream
    int start;           // Offset to the start of the part
};

// Kernel for vector summation
__global__ void vector_add(double *C, const double *A, const double *B,
                           int N, int iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Do not try to access past the allocated memory
    for (int i = idx; i < N; i += stride) {
        C[i] = 0;
        for (int j = 0; j < ITER_MULTIPLIER * iterations; j++) {
            C[i] += A[i] + B[i];
        }
    }
}


// Routine for stream test
void streamtest(double *hC, const double *hA, const double *hB,
                double *dC, const double *dA, const double *dB,
                stream *s, int nstreams, float *gputime, int tib,
                int iterations)
{
    // Add here the needed timing event calls
    cudaEvent_t start, stop;

    CUDA_CHECK( cudaEventCreate(&start) );
    CUDA_CHECK( cudaEventCreate(&stop) );

    CUDA_CHECK( cudaEventRecord(start) );    

    for (int i = 0; i < nstreams; ++i) {
        // Add here the copy - kernel execution - copy sequence
        // for each stream
        //
        // Each stream will copy their own part of the input data
        // to the GPU starting from address &(dA[sidx]).
        // Size of the block is slen (see variables below).
        int sidx = s[i].start;
        int slen = s[i].len;
                                    
        CUDA_CHECK( cudaMemcpyAsync((void *)&(dA[sidx]), (void *)&(hA[sidx]), 
                                    sizeof(double) * slen, 
                                    cudaMemcpyHostToDevice, s[i].strm) );

        CUDA_CHECK( cudaMemcpyAsync((void *)&(dB[sidx]), (void *)&(hB[sidx]),
                                    sizeof(double) * slen, 
                                    cudaMemcpyHostToDevice, s[i].strm) );

        // You can use these values for the grid and block sizes
        dim3 grid, threads;
        grid.x = (slen + tib - 1) / tib;
        threads.x = tib;
        
        vector_add<<<grid, threads, 0, s[i].strm>>>(&(dC[sidx]), &(dA[sidx]),
                                                    &(dB[sidx]), slen, 
                                                    iterations);

        CUDA_CHECK( cudaMemcpyAsync((void *)&(hC[sidx]), (void *)&(dC[sidx]),
                                    sizeof(double) * slen, 
                                    cudaMemcpyDeviceToHost, s[i].strm) );
    }    

    // Add the calls needed for execution timing and compute
    // the elapsed time to the gputime variable
    CUDA_CHECK( cudaEventRecord(stop) );
    CUDA_CHECK( cudaEventSynchronize(stop) );

    CHECK_ERROR_MSG("Stream test failed");

    CUDA_CHECK( cudaEventElapsedTime(gputime, start, stop) );

    CUDA_CHECK( cudaEventDestroy(start) );
    CUDA_CHECK( cudaEventDestroy(stop) );
}

// Routine for default stream reference
void default_stream(double *hC, const double *hA, const double *hB,
                    double *dC, const double *dA, const double *dB,
                    int N, float *gputime, int tib, int iterations)
{
    // Add here the needed timing event calls
    cudaEvent_t start, stop;

    CUDA_CHECK( cudaEventCreate(&start) );
    CUDA_CHECK( cudaEventCreate(&stop) );

    CUDA_CHECK( cudaEventRecord(start) );

    // Non-asynchronous copies
    CUDA_CHECK( cudaMemcpy((void *)dA, (void *)hA,
                            sizeof(double) * N,
                            cudaMemcpyHostToDevice) );

    CUDA_CHECK( cudaMemcpy((void *)dB, (void *)hB,
                            sizeof(double) * N,
                            cudaMemcpyHostToDevice) );

    dim3 grid, threads;
    grid.x = (N + tib - 1) / tib;
    threads.x = tib;

    vector_add<<<grid, threads>>>(dC, dA, dB, N, iterations);

    CUDA_CHECK( cudaMemcpy((void *)hC, (void *)dC,
                           sizeof(double) * N,
                           cudaMemcpyDeviceToHost) );

    // Add the calls needed for execution timing and compute
    // the elapsed time to the gputime variable
    CUDA_CHECK( cudaEventRecord(stop) );
    CUDA_CHECK( cudaEventSynchronize(stop) );

    CHECK_ERROR_MSG("Default stream test failed");

    CUDA_CHECK( cudaEventElapsedTime(gputime, start, stop) );

    CUDA_CHECK( cudaEventDestroy(start) );
    CUDA_CHECK( cudaEventDestroy(stop) );
}

// Create the streams and compute the decomposition
void create_streams(int nstreams, int vecsize, stream **strm)
{
    *strm = new stream[nstreams];
    stream *s = *strm;
    for(int i = 0; i < nstreams; i++) {
        CUDA_CHECK( cudaStreamCreate(&s[i].strm) );
    }

    s[0].start = 0;
    s[0].len = vecsize / nstreams;
    s[0].len += vecsize % nstreams ? 1 : 0;
    for(int i = 1; i < nstreams; i++) {
        s[i].len = vecsize / nstreams;
        if(i < vecsize % nstreams) {
            s[i].len++;
        }
        // Each part starts where the previous part ends; using the previous
        // part's length here avoids an off-by-one when the division is uneven
        s[i].start = s[i-1].start + s[i-1].len;
    }
}
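
// Example decomposition (illustrative): vecsize = 10, nstreams = 4 gives
// parts of length 3, 3, 2, 2 starting at 0, 3, 6, 8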

// Delete the streams
void destroy_streams(int nstreams, stream *s)
{
    for(int i = 0; i < nstreams; i++) {
        CUDA_CHECK( cudaStreamDestroy(s[i].strm) );
    }
    delete[] s;
}



int main(int argc, char *argv[])
{
    const int ThreadsInBlock = 512;
    int iterations;
    double *dA, *dB, *dC;
    double *hA, *hB, *hC;
    double ref_value;
    float gputime_ref, gputimes[N_TESTS];
    stream *s;

    cudaDeviceProp prop;

    if (argc < 2) {
        printf("Usage: %s N\nwhere N is the length of the vector.\n",
                argv[0]);
        exit(EXIT_FAILURE);
    }

    int N = atoi(argv[1]);

    // Determine the number of available multiprocessors on the device.
    // It is used for a coarse adjustment of the computation part of
    // this test.
    cudaGetDeviceProperties(&prop, 0);
    iterations = (prop.multiProcessorCount + 1) / 2;

    // Add here the host memory allocation routines (page-locked)
    CUDA_CHECK( cudaMallocHost((void**)&hA, sizeof(double) * N) );
    CUDA_CHECK( cudaMallocHost((void**)&hB, sizeof(double) * N) );
    CUDA_CHECK( cudaMallocHost((void**)&hC, sizeof(double) * N) );
    
    for(int i = 0; i < N; ++i) {
        hA[i] = 1.0;
        hB[i] = 2.0;
    }

    ref_value = 3.0 * ITER_MULTIPLIER * iterations;

    CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double) * N) );
    CUDA_CHECK( cudaMalloc((void**)&dB, sizeof(double) * N) );
    CUDA_CHECK( cudaMalloc((void**)&dC, sizeof(double) * N) );
    
    // Check the timings of default stream first
    default_stream(hC, hA, hB, dC, dA, dB, N, &gputime_ref, ThreadsInBlock,
                   iterations);

    // Here we loop over the test. On each iteration, we double the number
    // of streams.
    for(int strm = 0; strm < N_TESTS; strm++) {
        int stream_count = 1<<strm;
        create_streams(stream_count, N, &s);
        streamtest(hC, hA, hB, dC, dA, dB, s, stream_count, &gputimes[strm],
                   ThreadsInBlock, iterations);
        destroy_streams(stream_count, s);
    }

    CUDA_CHECK( cudaFree((void*)dA) );
    CUDA_CHECK( cudaFree((void*)dB) );
    CUDA_CHECK( cudaFree((void*)dC) );

    int errorsum = 0;
    for (int i = 0; i < N; i++) {
        errorsum += hC[i] - ref_value;
    }

    printf("Error sum = %i\n", errorsum);
    printf("Time elapsed for reference run: %f\n", gputime_ref / 1000.);
    for(int i = 0; i < N_TESTS; i++) {
        printf("Time elapsed for test %2i:       %f\n", 1<<i,
                gputimes[i] / 1000.);
    }

    // Add here the correct host memory freeing routines 
    CUDA_CHECK( cudaFreeHost((void*)hA) );
    CUDA_CHECK( cudaFreeHost((void*)hB) );
    CUDA_CHECK( cudaFreeHost((void*)hC) );

    return 0;
}
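Both this listing and the Jacobi code below include error_checks.h, which is not reproduced in this document. A minimal sketch of what such a header typically provides (an assumption about its contents, not the course's actual file):

#include <cstdio>
#include <cstdlib>

// Abort with file/line context if a CUDA runtime call fails
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

// Report errors from preceding kernel launches
#define CHECK_ERROR_MSG(msg)                                            \
    do {                                                                \
        cudaError_t err_ = cudaGetLastError();                          \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)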
Jacobi
Code Jacobi
#include <sys/time.h>
#include <cstdio>
#include "jacobi.h"
#include "error_checks.h"

// Change this to 0 if CPU reference result is not needed
#define COMPUTE_CPU_REFERENCE 1
#define MAX_ITERATIONS 3000

// CPU kernel
void sweepCPU(double* phi, const double *phiPrev, const double *source, 
              double h2, int N)
{ 
    int i, j;
    int index, i1, i2, i3, i4;

    for (j = 1; j < N-1; j++) {
        for (i = 1; i < N-1; i++) {
            index = i + j*N; 
            i1 = (i-1) +   j   * N;
            i2 = (i+1) +   j   * N;
            i3 =   i   + (j-1) * N;
            i4 =   i   + (j+1) * N;
            phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 
                                 phiPrev[i3] + phiPrev[i4] - 
                                 h2 * source[index]);
        } 
    }
} 

// GPU kernel
__global__ 
void sweepGPU(double *phi, const double *phiPrev, const double *source, 
              double h2, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j*N;
    int i1, i2, i3, i4;

    i1 = (i-1) +   j   * N;
    i2 = (i+1) +   j   * N;
    i3 =   i   + (j-1) * N;
    i4 =   i   + (j+1) * N;

    if (i > 0 && j > 0 && i < N-1 && j < N-1)
        phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 
                             phiPrev[i3] + phiPrev[i4] - 
                             h2 * source[index]);
}


double compareArrays(const double *a, const double *b, int N)
{
    double error = 0.0;
    int i;
    for (i = 0; i < N*N; i++) {
        error += fabs(a[i] - b[i]);
    }
    return error/(N*N);
}


double diffCPU(const double *phi, const double *phiPrev, int N)
{
    int i;
    double sum = 0;
    double diffsum = 0;
    
    for (i = 0; i < N*N; i++) {
        diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]);
        sum += phi[i] * phi[i];
    }

    return sqrt(diffsum/sum);
}


int main() 
{ 
    timeval t1, t2; // Structs for timing
    const int N = 512;
    double h = 1.0 / (N - 1);
    int iterations;
    const double tolerance = 5e-4; // Stopping condition
    int i, j, index;

    const int blocksize = 16;
  
    double *phi      = new double[N*N]; 
    double *phiPrev  = new double[N*N]; 
    double *source   = new double[N*N]; 
    double *phi_cuda = new double[N*N]; 

    double *phi_d, *phiPrev_d, *source_d; 
    // Size of the arrays in bytes
    const int size = N*N*sizeof(double); 
    double diff;
  
    // Source initialization
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {      
            double x, y;
            x = (i - N / 2) * h;
            y = (j - N / 2) * h;
            index = j + i * N;
            if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
                source[index] = 1e10*h*h;
            else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
                source[index] = -1e10*h*h;
            else
                source[index] = 0.0;
        }            
    }

    CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 
    CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 

    // Reset values to zero
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {      
            index = j + i * N;
            phi[index] = 0.0; 
            phiPrev[index] = 0.0; 
        }            
    }

    CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 
    CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 
    CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) );
    CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) );

    // CPU version 
    if(COMPUTE_CPU_REFERENCE) { 
        gettimeofday(&t1, NULL);

        // Do sweeps until the difference is under the tolerance
        diff = tolerance * 2;
        iterations = 0;
        while (diff > tolerance && iterations < MAX_ITERATIONS) {
            sweepCPU(phiPrev, phi, source, h * h, N);
            sweepCPU(phi, phiPrev, source, h * h, N);
            
            iterations += 2;
            if (iterations % 100 == 0) {
                diff = diffCPU(phi, phiPrev, N);
                printf("%d %g\n", iterations, diff);
            }
        }
        gettimeofday(&t2, NULL);
        printf("CPU Jacobi: %g seconds, %d iterations\n", 
               t2.tv_sec - t1.tv_sec + 
               (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
    }

    // GPU version

    dim3 dimBlock(blocksize, blocksize); 
    dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 
    
    // Do sweeps until the difference is under the tolerance
    diff = tolerance * 2;
    iterations = 0;

    gettimeofday(&t1, NULL);

    while (diff > tolerance && iterations < MAX_ITERATIONS) {
        // See above how the CPU update kernel is called
        // and implement similar calling sequence for the GPU code

        //// Add routines here
        sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h*h, N); 
        sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h*h, N); 
        CHECK_ERROR_MSG("Jacobi kernels");
        iterations += 2;
        
        if (iterations % 100 == 0) {
            // diffGPU is defined in the header file, it uses
            // Thrust library for reduction computation
            diff = diffGPU<double>(phiPrev_d, phi_d, N);
            CHECK_ERROR_MSG("Difference computation");
            printf("%d %g\n", iterations, diff);
        }
    }
    
    CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, size, cudaMemcpyDeviceToHost) ); 

    gettimeofday(&t2, NULL);
    printf("GPU Jacobi: %g seconds, %d iterations\n", 
           t2.tv_sec - t1.tv_sec + 
           (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);

    //// Add here the clean up code for all allocated CUDA resources
    CUDA_CHECK( cudaFree(phi_d) ); 
    CUDA_CHECK( cudaFree(phiPrev_d) );
    CUDA_CHECK( cudaFree(source_d) ); 

    if (COMPUTE_CPU_REFERENCE) {
        printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
    }
    
    delete[] phi; 
    delete[] phi_cuda;
    delete[] phiPrev; 
    delete[] source; 
    
    return EXIT_SUCCESS; 
} 
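For reference, the sweep kernels implement the standard Jacobi update for the Poisson equation \nabla^2 \phi = s, discretized on an N x N grid with spacing h:

\phi^{\mathrm{new}}_{i,j} = \frac{1}{4}\left( \phi_{i-1,j} + \phi_{i+1,j} + \phi_{i,j-1} + \phi_{i,j+1} - h^2\, s_{i,j} \right)

which is exactly the 0.25 * (...) expression in sweepCPU and sweepGPU, with phiPrev supplying the neighbor values from the previous iteration.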

…