CUDA Coding Practice
-
Compiling a program for CUDA
-
For example, to compile MyProg.cu you would use a command like
-
nvcc -o MyProg MyProg.cu
-
// Note that in this model we do not check
// the error codes and status of kernel call.
#include <cstdio>
#include <cmath>
__global__ void hello()
{
printf("Greetings from your GPU\n");
}
int main(void)
{
int count, device;
cudaGetDeviceCount(&count);
cudaGetDevice(&device);
printf("You have in total %d GPUs in your system\n", count);
printf("GPU %d will now print a message for you:\n", device);
hello<<<1,1>>>();
cudaDeviceSynchronize();
return 0;
}
-
Array Addition Examples in CPU/GPU
//
// gcc 01_array_addition_cpu.cpp
// nvcc 01_array_addition_cpu.cpp
//
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
const double EPSILON = 1.0E-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX);
void array_check(const double *vecC, const int NX);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
const int NX = 1000000;
int size_array = sizeof(double) * NX;
double *vecA = (double *)malloc(size_array);
double *vecB = (double *)malloc(size_array);
double *vecC = (double *)malloc(size_array);
for (int i = 0; i < NX; i++)
{
vecA[i] = a;
vecB[i] = b;
}
array_addition(vecA, vecB, vecC, NX);
array_check(vecC, NX);
free(vecA);
free(vecB);
free(vecC);
printf("\n--Ending of the main function.\n\n");
return 0;
}
void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX)
{
for (int i = 0; i < NX; i++)
vecC[i] = vecA[i] + vecB[i];
}
void array_check(const double *vecC, const int NX)
{
bool has_error = false;
for (int i = 0; i < NX; i++)
{
if (fabs(vecC[i] - c) > EPSILON)
{
has_error = true;
break;
}
}
printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
//
// nvcc 02_array_addition_gpu.cu
//
#include <math.h>
#include <stdio.h>
const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ array_addition(const double *vecA, const double *vecB, double *vecC);
void array_check(const double *vecC, const int NX);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
const int NX = 25600004; // try 25600000, 25600004 and 25600008
int size_array = sizeof(double) * NX;
double *h_vecA = (double *)malloc(size_array); // 'h' for host (CPU)
double *h_vecB = (double *)malloc(size_array);
double *h_vecC = (double *)malloc(size_array);
for (int i = 0; i < NX; i++)
{
h_vecA[i] = a;
h_vecB[i] = b;
}
double *d_vecA, *d_vecB, *d_vecC; // 'd' for device (GPU)
cudaMalloc((void **)&d_vecA, size_array);
cudaMalloc((void **)&d_vecB, size_array);
cudaMalloc((void **)&d_vecC, size_array);
cudaMemcpy(d_vecA, h_vecA, size_array, cudaMemcpyHostToDevice); // copy data from host(CPU) to device(GPU)
cudaMemcpy(d_vecB, h_vecB, size_array, cudaMemcpyHostToDevice);
const int block_size = 128;
int grid_size = (NX + block_size - 1) / block_size;
printf("\n\tArray_size = %d, grid_size = %d and block_size = %d.\n", NX, grid_size, block_size);
array_addition<<<grid_size, block_size>>>(d_vecA, d_vecB, d_vecC);
cudaMemcpy(h_vecC, d_vecC, size_array, cudaMemcpyDeviceToHost); // copy data from device(GPU) to host(CPU)
array_check(h_vecC, NX);
free(h_vecA);
free(h_vecB);
free(h_vecC);
cudaFree(d_vecA);
cudaFree(d_vecB);
cudaFree(d_vecC);
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ array_addition(const double *vecA, const double *vecB, double *vecC)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
vecC[i] = vecA[i] + vecB[i];
}
void array_check(const double *vecC, const int NX)
{
bool has_error = false;
for (int i = 0; i < NX; i++)
{
if (fabs(vecC[i] - c) > EPSILON)
{
has_error = true;
break;
}
}
printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
ADD SOME RESULTS
-
Array Reduce Examples in CPU/GPU
A reduction of an array to a single value on a GPU. Data is copied to the GPU memory, where each thread adds one element to the accumulated value. The thread-safe atomic operations have to be used in order to ensure that there are no race conditions. Many threads will run simultaneously on a GPU, so there is no need for a loop over the indices.
//
// nvcc -O3 -DUSE_DP 01_array_reduce_cpu.cu
// nvcc -O3 01_array_reduce_cpu.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#ifdef USE_DP
typedef double real; // for double precision
#else
typedef float real; // for single precision
#endif
const int NUM_REPEATS = 20;
void timing(const real *x, const int NX);
real reduce(const real *x, const int NX);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
real *x = (real *) malloc(size_array);
for (int n = 0; n < NX; ++n)
x[n] = 1.23;
timing(x, NX);
free(x);
printf("\n--Ending of the main function.\n\n");
return 0;
}
void timing(const real *x, const int NX)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(x, NX);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
real reduce(const real *x, const int NX)
{
real sum = 0.0;
for (int n = 0; n < NX; ++n)
sum += x[n];
return sum;
}
//
// nvcc -O3 -DUSE_DP 02_array_reduce_gpu.cu
// nvcc -O3 02_array_reduce_gpu.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
void timing(real *h_x, real *d_x, const int method);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
real *h_x = (real *)malloc(size_array);
for (int n = 0; n < NX; ++n)
h_x[n] = 1.23;
real *d_x;
CHECK(cudaMalloc(&d_x, size_array));
printf("\n\tUsing global memory only:\n");
timing(h_x, d_x, 0);
printf("\n\tUsing static shared memory:\n");
timing(h_x, d_x, 1);
printf("\n\tUsing dynamic shared memory:\n");
timing(h_x, d_x, 2);
free(h_x);
CHECK(cudaFree(d_x));
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ reduction_global(real *d_x, real *d_y)
{
const int tid = threadIdx.x;
real *x = d_x + blockDim.x * blockIdx.x;
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
x[tid] += x[tid + offset];
__syncthreads();
}
if (tid == 0)
d_y[blockIdx.x] = x[0];
}
void __global__ reduction_static_shared(real *d_x, real *d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
__shared__ real s_y[128];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
if (tid == 0)
d_y[bid] = s_y[0];
}
void __global__ reduction_dynamic_shared(real *d_x, real *d_y)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ real s_y[];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
if (tid == 0)
d_y[bid] = s_y[0];
}
real reduction(real *d_x, const int method)
{
int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int ymem = sizeof(real) * grid_size;
const int smem = sizeof(real) * BLOCK_SIZE;
real *d_y;
CHECK(cudaMalloc(&d_y, ymem));
real *h_y = (real *) malloc(ymem);
switch (method)
{
case 0:
reduction_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y); break;
case 1:
reduction_static_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y); break;
case 2:
reduction_dynamic_shared<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y); break;
default:
printf("Error: wrong method\n");
exit(1);
break;
}
CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));
real result = 0.0;
for (int n = 0; n < grid_size; ++n)
result += h_y[n];
free(h_y);
CHECK(cudaFree(d_y));
return result;
}
void timing(real *h_x, real *d_x, const int method)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduction(d_x, method);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
//
// nvcc -O3 -DUSE_DP 03_array_reduce_gpu_atomic.cu
// nvcc -O3 03_array_reduce_gpu_atomic.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
void timing(const real *d_x);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
real *h_x = (real *) malloc(size_array);
for (int n = 0; n < NX; ++n)
h_x[n] = 1.23;
real *d_x;
CHECK(cudaMalloc(&d_x, size_array));
CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));
printf("\n\tUsing atomicAdd:\n");
timing(d_x);
free(h_x);
CHECK(cudaFree(d_x));
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ reduce(const real *d_x, real *d_y, const int N)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ real s_y[];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
if (tid == 0)
atomicAdd(d_y, s_y[0]);
}
real reduce(const real *d_x)
{
const int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int smem = sizeof(real) * BLOCK_SIZE;
real h_y[1] = {0};
real *d_y;
CHECK(cudaMalloc(&d_y, sizeof(real)));
CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));
reduce<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX);
CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
CHECK(cudaFree(d_y));
return h_y[0];
}
void timing(const real *d_x)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
A reduction of an array to a single value on a GPU. Data is copied to the GPU memory, where each thread adds one element to the accumulated value. Note that the thread-safe atomic operations have to be used in order to ensure that there are no race conditions. Many threads will run simultaneously on a GPU, so there is no need for a loop over the indices.
//
// nvcc -O3 -DUSE_DP 04_array_reduce_gpu_shuffle.cu
// nvcc -O3 04_array_reduce_gpu_shuffle.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const unsigned FULL_MASK = 0xffffffff;
void timing(const real *d_x, const int method);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
real *h_x = (real *) malloc(size_array);
for (int n = 0; n < NX; ++n)
h_x[n] = 1.23;
real *d_x;
CHECK(cudaMalloc(&d_x, size_array));
CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));
printf("\n\tUsing syncwarp:\n");
timing(d_x, 0);
printf("\n\tUsing shuffle:\n");
timing(d_x, 1);
printf("\n\tUsing cooperative group:\n");
timing(d_x, 2);
free(h_x);
CHECK(cudaFree(d_x));
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ reduce_syncwarp(const real *d_x, real *d_y, const int NX)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ real s_y[];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
for (int offset = 16; offset > 0; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncwarp();
}
if (tid == 0)
atomicAdd(d_y, s_y[0]);
}
void __global__ reduce_shfl(const real *d_x, real *d_y, const int NX)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ real s_y[];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
real y = s_y[tid];
for (int offset = 16; offset > 0; offset >>= 1)
y += __shfl_down_sync(FULL_MASK, y, offset);
if (tid == 0)
atomicAdd(d_y, y);
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int n = bid * blockDim.x + tid;
extern __shared__ real s_y[];
s_y[tid] = (n < NX) ? d_x[n] : 0.0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
real y = s_y[tid];
thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
for (int i = g.size() >> 1; i > 0; i >>= 1)
y += g.shfl_down(y, i);
if (tid == 0)
atomicAdd(d_y, y);
}
real reduce(const real *d_x, const int method)
{
const int grid_size = (NX + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int smem = sizeof(real) * BLOCK_SIZE;
real h_y[1] = {0};
real *d_y;
CHECK(cudaMalloc(&d_y, sizeof(real)));
CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));
switch (method)
{
case 0:
reduce_syncwarp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
case 1:
reduce_shfl<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
case 2:
reduce_cp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, NX); break;
default:
printf("\tWrong method.\n"); exit(1);
}
CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
CHECK(cudaFree(d_y));
return h_y[0];
}
void timing(const real *d_x, const int method)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x, method);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
//
// nvcc -O3 -DUSE_DP 05_array_reduce_gpu_parallelism.cu
// nvcc -O3 05_array_reduce_gpu_parallelism.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;
void timing(const real *h_x);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
real *h_x = (real *) malloc(size_array);
for (int n = 0; n < NX; ++n)
h_x[n] = 1.23;
real *d_x;
CHECK(cudaMalloc(&d_x, size_array));
CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));
timing(d_x);
free(h_x);
CHECK(cudaFree(d_x));
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
extern __shared__ real s_y[];
real y = 0.0;
const int stride = blockDim.x * gridDim.x;
for (int n = bid * blockDim.x + tid; n < NX; n += stride)
y += d_x[n];
s_y[tid] = y;
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
y = s_y[tid];
thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
for (int i = g.size() >> 1; i > 0; i >>= 1)
y += g.shfl_down(y, i);
if (tid == 0)
d_y[bid] = y;
}
real reduce(const real *d_x)
{
const int ymem = sizeof(real) * GRID_SIZE;
const int smem = sizeof(real) * BLOCK_SIZE;
real h_y[1] = {0};
real *d_y;
CHECK(cudaMalloc(&d_y, ymem));
reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, NX);
reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
CHECK(cudaFree(d_y));
return h_y[0];
}
void timing(const real *d_x)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
//
// nvcc -O3 -DUSE_DP 06_array_reduce_gpu_static.cu
// nvcc -O3 06_array_reduce_gpu_static.cu
//
#include "error_checker.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif
const int NUM_REPEATS = 20;
const int NX = 100000000;
const int size_array = sizeof(real) * NX;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;
void timing(const real *d_x);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
real *h_x = (real *) malloc(size_array);
for (int n = 0; n < NX; ++n)
h_x[n] = 1.23;
real *d_x;
CHECK(cudaMalloc(&d_x, size_array));
CHECK(cudaMemcpy(d_x, h_x, size_array, cudaMemcpyHostToDevice));
timing(d_x);
free(h_x);
CHECK(cudaFree(d_x));
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ reduce_cp(const real *d_x, real *d_y, const int NX)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
extern __shared__ real s_y[];
real y = 0.0;
const int stride = blockDim.x * gridDim.x;
for (int n = bid * blockDim.x + tid; n < NX; n += stride)
y += d_x[n];
s_y[tid] = y;
__syncthreads();
for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
{
if (tid < offset)
s_y[tid] += s_y[tid + offset];
__syncthreads();
}
y = s_y[tid];
thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
for (int i = g.size() >> 1; i > 0; i >>= 1)
y += g.shfl_down(y, i);
if (tid == 0)
d_y[bid] = y;
}
__device__ real static_y[GRID_SIZE];
real reduce(const real *d_x)
{
real *d_y;
CHECK(cudaGetSymbolAddress((void**)&d_y, static_y));
const int smem = sizeof(real) * BLOCK_SIZE;
reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, NX);
reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
real h_y[1] = {0};
CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
// CHECK(cudaMemcpyFromSymbol(h_y, static_y, sizeof(real)); // also ok
return h_y[0];
}
void timing(const real *d_x)
{
real sum = 0;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
sum = reduce(d_x);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("\tTime = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
printf("\tSum = %f.\n", sum);
}
ADD SOME RESULTS
-
Matrix SummationExamples in GPU
//
// nvcc 01_GPU_grid_block_thread_info.cu
//
#include <cuda_runtime.h>
#include <stdio.h>
#include "error_checker.cuh"
void __global__ check_grid_block_thread_info_GPU(void);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n\n");
printf("\t***************************************************\n");
printf("\t********** Output for num_element = 1024 **********\n");
printf("\t***************************************************\n\n");
int num_elements = 1024;
printf("\t\tThere are %d data, which can be distributed:\n", num_elements);
// define grid and block structure
dim3 block(1024); // == (block.x = 1024; block.y = 1; block.z = 1;)
dim3 grid((num_elements + block.x - 1) / block.x);
printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
// reset block
block.x = 512;
grid.x = (num_elements + block.x - 1) / block.x;
printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
// reset block
block.x = 256;
grid.x = (num_elements + block.x - 1) / block.x;
printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
// reset block
block.x = 128;
grid.x = (num_elements + block.x - 1) / block.x;
printf("\t\t- grid.x=%d, block.x=%d\n\n", grid.x, block.x);
CHECK(cudaDeviceSynchronize());
printf("\t***************************************************\n");
printf("\t*********** Output for num_element = 16 ***********\n");
printf("\t***************************************************\n\n");
// reset the total number of data element
num_elements = 16;
// reset grid and block structure
block.x = 2;
grid.x = (num_elements + block.x - 1) / block.x;
// check grid and block info from host side
printf("\t\t- CPU output -- grid.x=%d, grid.y=%d, grid.z=%d\n", grid.x, grid.y, grid.z);
printf("\t\t- CPU output -- block.x=%d, block.y=%d, block.z=%d\n", block.x, block.y, block.z);
putchar('\n');
check_grid_block_thread_info_GPU<<<grid, block>>>();
CHECK(cudaDeviceReset());
printf("\n--Ending of the main function.\n\n");
return 0;
}
void __global__ check_grid_block_thread_info_GPU(void)
{
printf("\t\t- GPU output -- gridDim=(%d, %d, %d) blockDim=(%d, %d, %d) blockIdx=(%d, %d, %d) threadIdx=(%d, %d, %d)\n",
gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
}
//
// nvcc 02_matrix_thread_index_info.cu
//
#include <cstdio>
#include <cuda_runtime.h>
#include "error_checker.cuh"
void initialInt(int *matrix, int nxy);
void printMatrix(int *h_matrixA, const int nx, const int ny);
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nUsing Device %d: %s\n", dev, deviceProp.name); // Using Device 0: NVIDIA GeForce GT 1030
int nx = 8;
int ny = 6;
int nxy = nx * ny;
int size_matrix = nxy*(sizeof(int));
// malloc host mem
int *h_matrixA;
h_matrixA = (int *)malloc(size_matrix);
initialInt(h_matrixA, nxy);
printMatrix(h_matrixA, nx, ny);
//malloc device mem
int *d_matrixA;
cudaMalloc((void **)&d_matrixA, size_matrix); //
cudaMemcpy(d_matrixA, h_matrixA, size_matrix, cudaMemcpyHostToDevice); // copy data from CPU to GPU
// setup excution configuration
dim3 block(4, 2);
dim3 grid((nx + block.x-1)/block.x, (ny + block.y-1)/block.y);
printf("\ngrid info >>> grid.x=%d grid.y=%d grid.z=%d.\n", grid.x, grid.y, grid.z);
printf("block info >>> block.x=%d block.y=%d block.z=%d.\n\n", block.x, block.y, block.z);
//invoke kernel
printGPUIdx<<<grid, block>>>(d_matrixA, nx, ny);
cudaDeviceSynchronize();
printf("\n");
//free host and device
free(h_matrixA);
cudaFree(d_matrixA);
//reset device
CHECK(cudaDeviceReset());
printf("\n--Ending of the main function.\n\n");
return 0;
}
void initialInt(int *matrix, int nxy){
for(int i = 0; i < nxy; i++)
matrix[i] = i;
}
void printMatrix(int *h_matrixA, const int nx, const int ny){
int *ic = h_matrixA;
printf("\nMatrix:%d, %d\n", nx, ny);
for(int iy = 0; iy < ny; iy++){
for(int ix=0; ix < nx; ix++)
printf("%3d ",ic[ix]);
ic += nx;
printf("\n");
}
}
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny){
int ix = threadIdx.x + blockIdx.x*blockDim.x;
int iy = threadIdx.y + blockIdx.y*blockDim.y;
unsigned int idx = iy*nx + ix;
printf("block_id (%d %d) thread_id (%d,%d) coordinate (%d %d) global_index (%d) value (%d)\n",
blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, ix, iy, idx, d_matrixA[idx]);
}
//
// nvcc 03_matrix_summation_GPU_2D2D_2D1D_1D1D.cu
//
#include <cuda_runtime.h>
#include <stdio.h>
#include "error_checker.cuh"
const double EPSILON = 1.0E-8;
void matrix_initialization(float *ip, const int size);
void matrix_summation_on_CPU(float *A, float *B, float *C, const int, const int);
void check_results_from_CPU_GPU(float *fromCPU, float *fromGPU, const int);
void __global__ matrix_summation_on_GPU_1D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D2D(float *A, float *B, float *C, int, int);
int main(int argc, const char * argv[])
{
printf("\n--Beginning of the main function.\n");
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("\n\tUsing Device %d: %s\n", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx = 1 << 10;
int ny = 1 << 10;
int nxy = nx * ny;
int size_matrix = nxy * sizeof(float);
printf("\n\tMatrix size: nx=%d ny=%d\n", nx, ny);
// malloc host memory
float *h_matrixA, *h_matrixB, *h_matrixSumFromCPU, *h_matrixSumFromGPU;
h_matrixA = (float *)malloc(size_matrix);
h_matrixB = (float *)malloc(size_matrix);
h_matrixSumFromCPU = (float *)malloc(size_matrix);
h_matrixSumFromGPU = (float *)malloc(size_matrix);
// initialize data at host side and define a timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaEventQuery(start);
matrix_initialization(h_matrixA, nxy);
matrix_initialization(h_matrixB, nxy);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, stop); //CHECK();
printf("\tMatrix initialization on host(CPU) elapsed %f sec\n", elapsed_time);
memset(h_matrixSumFromCPU, 0, size_matrix);
memset(h_matrixSumFromGPU, 0, size_matrix);
// summation of matrix elements at host(CPU) side
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaEventQuery(start);
matrix_summation_on_CPU(h_matrixA, h_matrixB, h_matrixSumFromCPU, nx, ny);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time, start, stop);
printf("\tMatrix summation on host(CPU) elapsed %f sec\n", elapsed_time);
// malloc device global memory
float *d_matrixA, *d_matrixB, *d_matrixC;
CHECK(cudaMalloc((void **)&d_matrixA, size_matrix));
CHECK(cudaMalloc((void **)&d_matrixB, size_matrix));
CHECK(cudaMalloc((void **)&d_matrixC, size_matrix));
// transfer data from host to device
CHECK(cudaMemcpy(d_matrixA, h_matrixA, size_matrix, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_matrixB, h_matrixB, size_matrix, cudaMemcpyHostToDevice));
//---------------
// invoke kernel at host side for summation on GPU using 2D_grid and 2D_block
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy); // (32, 32, 1)
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); //(32, 32, 1)
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaEventQuery(start);
matrix_summation_on_GPU_2D2D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time, start, stop);
printf("\n\tMatrix summation on GPU (2D_grid 2D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n",
grid.x, grid.y, block.x, block.y, elapsed_time);
cudaGetLastError(); // check kernel error
// copy kernel result back to host side
cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);
// comparison of computation results
check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------
// invoke kernel at host side for summation on GPU using 2D_grid and 1D_block
dimy = 1;
block.y = dimy; // block (32, 1, 1)
grid.x = (nx + block.x - 1) / block.x;
grid.y = ny; // grid (32, 1024, 1)
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaEventQuery(start);
matrix_summation_on_GPU_2D1D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time, start, stop);
printf("\n\tMatrix summation on GPU (2D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n",
grid.x, grid.y, block.x, block.y, elapsed_time);
cudaGetLastError(); // check kernel error
// copy kernel result back to host side
cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);
// comparison of computation results
check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------
// invoke kernel at host side for summation on GPU using 1D_grid and 1D_block
dimy = 1;
block.y = dimy; // block (32, 1, 1)
grid.x = (nx + block.x - 1) / block.x;
grid.y = 1; // grid (32, 1, 1)
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaEventQuery(start);
matrix_summation_on_GPU_1D1D<<<grid, block>>>(d_matrixA, d_matrixB, d_matrixC, nx, ny);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time, start, stop);
printf("\n\tMatrix summation on GPU (1D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n",
grid.x, grid.y, block.x, block.y, elapsed_time);
cudaGetLastError(); // check kernel error
// copy kernel result back to host side
cudaMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, cudaMemcpyDeviceToHost);
// comparison of computation results
check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------
// destroy start and stop events
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
// free host memory and device global memory
free(h_matrixA);
free(h_matrixB);
free(h_matrixSumFromCPU);
free(h_matrixSumFromGPU);
cudaFree(d_matrixA);
cudaFree(d_matrixB);
cudaFree(d_matrixC);
CHECK(cudaDeviceReset());
printf("\n--Ending of the main function.\n\n");
return 0;
}
void matrix_initialization(float *ip, const int size)
{
for(int i = 0; i < size; i++)
ip[i] = (float)(rand() & 0xFF) / 10.0f;
}
void matrix_summation_on_CPU(float *matrixA, float *matrixB, float *matrixC,
const int nx, const int ny)
{
float *ia = matrixA;
float *ib = matrixB;
float *ic = matrixC;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
ic[ix] = ia[ix] + ib[ix];
ia += nx;
ib += nx;
ic += nx;
}
}
void __global__ matrix_summation_on_GPU_2D2D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_2D1D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = blockIdx.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_1D1D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
if (ix < nx)
for (int iy = 0; iy < ny; iy++)
{
int idx = iy * nx + ix;
matrixC[idx] = matrixA[idx] + matrixB[idx];
}
}
void check_results_from_CPU_GPU(float *h_matrixSumFromCPU,
float *h_matrixSumFromGPU, const int N)
{
bool has_error = false;
for (int i = 0; i < N; i++)
{
if (abs(h_matrixSumFromCPU[i] - h_matrixSumFromGPU[i]) > EPSILON)
{
has_error = true;
printf("host %f gpu %f\n", h_matrixSumFromCPU[i], h_matrixSumFromGPU[i]);
break;
}
}
printf("\tChecking matrix summation results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
ADD SOME RESULTS
-
Parallel reduction Examples in CPU/GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float result = 0.0;
// Timing
clock_t start = clock();
// Main loop
for (int i = 0; i < numElements; i++)
{
result = result + data[i];
}
// Timing
clock_t finish = clock();
printf("The result is: %f\n", result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(data);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
void reduce(const float* data, float* result, int numElements)
{
for (int i = 0; i < numElements/2; i++)
{
result[i] = data[2*i] + data[2*i + 1];
}
if (numElements % 2 != 0)
{
result[0] += data[numElements-1];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float* result = (float*)calloc(numElements, sizeof(float));
// Timing
clock_t start = clock();
reduce(data, result, numElements);
// Main loop
for (int numElementsCurrent = numElements/2; numElementsCurrent > 1; numElementsCurrent /= 2)
{
reduce(result, result, numElementsCurrent);
}
// Timing
clock_t finish = clock();
printf("The result is: %f\n", result[0]);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(data);
free(result);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
atomicAdd(result, data[i]);
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_result;
cudaMalloc((void**)&d_result, 1*sizeof(float));
cudaMemset(d_result, 0.0, 1);
// Timing
clock_t start = clock();
// Call the reduction kernel
reduce_kernel<<<numBlocks, threadsPerBlock>>>(d_data, d_result, numElements);
cudaMemcpy(&h_result, d_result, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
int d_i = threadIdx.x + blockIdx.x*blockDim.x;
result[d_i] = getValue(data, 2*d_i, numElements) + getValue(data, 2*d_i + 1, numElements);
if (d_i == 0 && numElements % 2 != 0)
{
result[d_i] += data[numElements-1];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/2/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numElements*sizeof(float));
cudaMalloc((void**)&d_result2, numElements*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numElements/2; numElementsCurrent > 1; numElementsCurrent = numElementsCurrent/2)
{
int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock>>>(d_result1, d_result2, numElementsCurrent);
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
extern __shared__ float s_data[];
int s_i = threadIdx.x;
int d_i = threadIdx.x + blockIdx.x*blockDim.x;
s_data[s_i] = getValue(data, d_i, numElements);
for (int offset = 1; offset < blockDim.x; offset *= 2)
{
__syncthreads();
if (s_i % (2*offset) == 0)
{
s_data[s_i] += s_data[s_i + offset];
}
}
if (s_i == 0)
{
result[blockIdx.x] = s_data[0];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
{
int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
numElementsCurrent = numBlocksCurrent;
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
extern __shared__ float s_data[];
int s_i = threadIdx.x;
int d_i = threadIdx.x + blockIdx.x*blockDim.x;
s_data[s_i] = getValue(data, d_i, numElements);
for (int offset = 1; offset < blockDim.x; offset *= 2)
{
__syncthreads();
int index = 2 * offset * s_i;
if (index < blockDim.x)
{
s_data[index] += s_data[index + offset];
}
}
if (s_i == 0)
{
result[blockIdx.x] = s_data[0];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
{
int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
numElementsCurrent = numBlocksCurrent;
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
extern __shared__ float s_data[];
int s_i = threadIdx.x;
int d_i = threadIdx.x + blockIdx.x*blockDim.x;
s_data[s_i] = getValue(data, d_i, numElements);
for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
__syncthreads();
if (s_i < offset)
{
s_data[s_i] += s_data[s_i + offset];
}
}
if (s_i == 0)
{
result[blockIdx.x] = s_data[0];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
{
int numBlocksCurrent = numElementsCurrent/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
numElementsCurrent = numBlocksCurrent;
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
extern __shared__ float s_data[];
int s_i = threadIdx.x;
int d_i = threadIdx.x + blockIdx.x*2*blockDim.x;
s_data[s_i] = getValue(data, d_i, numElements) + getValue(data, d_i + blockDim.x, numElements);
for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
__syncthreads();
if (s_i < offset)
{
s_data[s_i] += s_data[s_i + offset];
}
}
if (s_i == 0)
{
result[blockIdx.x] = s_data[0];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/2/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
{
int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
numElementsCurrent = numBlocksCurrent;
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
__device__ __forceinline__ float getValue(const float* data, int index, int numElements)
{
if(index < numElements)
{
return data[index];
}
else
{
return 0.0f;
}
}
__device__ void warpReduce(volatile float* sdata, int s_i)
{
sdata[s_i] += sdata[s_i + 32];
sdata[s_i] += sdata[s_i + 16];
sdata[s_i] += sdata[s_i + 8];
sdata[s_i] += sdata[s_i + 4];
sdata[s_i] += sdata[s_i + 2];
sdata[s_i] += sdata[s_i + 1];
}
__global__ void reduce_kernel(const float* data, float* result, int numElements)
{
extern __shared__ float s_data[];
int s_i = threadIdx.x;
int d_i = threadIdx.x + blockIdx.x*2*blockDim.x;
s_data[s_i] = getValue(data, d_i, numElements) + getValue(data, d_i + blockDim.x, numElements);
for (int offset = blockDim.x / 2; offset > 32; offset >>= 1)
{
__syncthreads();
if (s_i < offset)
{
s_data[s_i] += s_data[s_i + offset];
}
}
if (s_i < 32)
{
warpReduce(s_data, s_i);
}
if (s_i == 0)
{
result[blockIdx.x] = s_data[0];
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 100000000;
printf("Reducing over %d values.\n", numElements);
float* h_data = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data[i] = float(rand())/float(RAND_MAX + 1.0);
}
float h_result = 0.0;
float* d_data;
cudaMalloc((void**)&d_data, numElements*sizeof(float));
cudaMemcpy(d_data, h_data, numElements*sizeof(float), cudaMemcpyHostToDevice);
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/2/BLOCK_SIZE + 1;
float* d_result1;
float* d_result2;
cudaMalloc((void**)&d_result1, numBlocks*sizeof(float));
cudaMalloc((void**)&d_result2, numBlocks*sizeof(float));
// Timing
clock_t start = clock();
// Main loop
reduce_kernel<<<numBlocks, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_data, d_result1, numElements);
for (int numElementsCurrent = numBlocks; numElementsCurrent > 1; )
{
int numBlocksCurrent = numElementsCurrent/2/BLOCK_SIZE + 1;
reduce_kernel<<<numBlocksCurrent, threadsPerBlock, threadsPerBlock*sizeof(float)>>>(d_result1, d_result2, numElementsCurrent);
numElementsCurrent = numBlocksCurrent;
std::swap(d_result1, d_result2);
}
cudaMemcpy(&h_result, d_result1, 1*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The result is: %f\n", h_result);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data);
cudaFree(d_data);
cudaFree(d_result1);
cudaFree(d_result2);
return 0;
}
ADD SOME RESULTS
-
Task Parallelism Examples in CPU/GPU
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <math.h>
#include <time.h>
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
void func1(const float* in, float* out, int numElements)
{
for (int i = 0; i < numElements; i++)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = std::sin(value);
}
out[i] = value;
}
}
void func2(const float* in, float* out, int numElements)
{
for (int i = 0; i < numElements; i++)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = -std::sin(value);
}
out[i] = value;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* data1 = (float*)calloc(numElements, sizeof(float));
float* data2 = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
data1[i] = float(rand())/float(RAND_MAX + 1.0);
data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
// Timing
clock_t start = clock();
func1(data1, data1, numElements);
func2(data2, data2, numElements);
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", data1[i], data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", data1[i], data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += data1[i];
sum2 += data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(data1);
free(data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = -sinf(value);
}
out[i] = value;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1 = (float*)calloc(numElements, sizeof(float));
float* h_data2 = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
// Timing
clock_t start = clock();
cudaMemcpy(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice);
func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
cudaMemcpy(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice);
func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data2, d_data2, numElements);
cudaMemcpy(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data1);
free(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = -sinf(value);
}
out[i] = value;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1;
float* h_data2;
cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
cudaMallocHost((void**)&h_data2, numElements*sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
// Timing
clock_t start = clock();
cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data2, d_data2, numElements);
cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);
cudaDeviceSynchronize();
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
cudaFreeHost(h_data1);
cudaFreeHost(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <math.h>
#include <time.h>
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
void func1(const float* in, float* out, int numElements)
{
for (int i = 0; i < numElements; i++)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = std::sin(value);
}
out[i] = value;
}
}
void func2(const float* in1, const float* in2, float* out, int numElements)
{
for (int i = 0; i < numElements; i++)
{
float value1 = in1[numElements - i - 1];
float value2 = in2[i];
for (int iter = 0; iter < numIterations; iter++)
{
value2 = -std::sin(value2);
}
out[i] = value1 + value2;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* data1 = (float*)calloc(numElements, sizeof(float));
float* data2 = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
data1[i] = float(rand())/float(RAND_MAX + 1.0);
data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
// Timing
clock_t start = clock();
func1(data1, data1, numElements);
func2(data1, data2, data2, numElements);
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", data1[i], data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", data1[i], data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += data1[i];
sum2 += data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(data1);
free(data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value1 = in1[numElements - i - 1];
float value2 = in2[i];
for (int iter = 0; iter < numIterations; iter++)
{
value2 = -sinf(value2);
}
out[i] = value1 + value2;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1 = (float*)calloc(numElements, sizeof(float));
float* h_data2 = (float*)calloc(numElements, sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
// Timing
clock_t start = clock();
cudaMemcpy(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice);
func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
cudaMemcpy(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice);
func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data2, d_data2, numElements);
cudaMemcpy(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost);
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
free(h_data1);
free(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value1 = in1[numElements - i - 1];
float value2 = in2[i];
for (int iter = 0; iter < numIterations; iter++)
{
value2 = -sinf(value2);
}
out[i] = value1 + value2;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1;
float* h_data2;
cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
cudaMallocHost((void**)&h_data2, numElements*sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
// Timing
clock_t start = clock();
cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);
cudaDeviceSynchronize();
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
cudaFreeHost(h_data1);
cudaFreeHost(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value1 = in1[numElements - i - 1];
float value2 = in2[i];
for (int iter = 0; iter < numIterations; iter++)
{
value2 = -sinf(value2);
}
out[i] = value1 + value2;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1;
float* h_data2;
cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
cudaMallocHost((void**)&h_data2, numElements*sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
// Timing
clock_t start = clock();
cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
cudaDeviceSynchronize();
func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);
cudaDeviceSynchronize();
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
cudaFreeHost(h_data1);
cudaFreeHost(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>
#define BLOCK_SIZE 256
static constexpr int numIterations = 100;
static constexpr int numValuesToPrint = 10;
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value = in[i];
for (int iter = 0; iter < numIterations; iter++)
{
value = sinf(value);
}
out[i] = value;
}
}
__global__ void func2_kernel(const float* in1, const float* in2, float* out, int numElements)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < numElements)
{
float value1 = in1[numElements - i - 1];
float value2 = in2[i];
for (int iter = 0; iter < numIterations; iter++)
{
value2 = -sinf(value2);
}
out[i] = value1 + value2;
}
}
int main(int argc, char* argv[])
{
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
printf("Transforming %d values.\n", numElements);
float* h_data1;
float* h_data2;
cudaMallocHost((void**)&h_data1, numElements*sizeof(float));
cudaMallocHost((void**)&h_data2, numElements*sizeof(float));
srand(1214134);
for (int i = 0; i < numElements; i++)
{
h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
}
int threadsPerBlock = BLOCK_SIZE;
int numBlocks = numElements/BLOCK_SIZE + 1;
float* d_data1;
float* d_data2;
cudaMalloc((void**)&d_data1, numElements*sizeof(float));
cudaMalloc((void**)&d_data2, numElements*sizeof(float));
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaEvent_t data1IsReady;
cudaEventCreate(&data1IsReady);
// Timing
clock_t start = clock();
cudaMemcpyAsync(d_data1, h_data1, numElements*sizeof(float), cudaMemcpyHostToDevice, stream1);
func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
cudaEventRecord(data1IsReady, stream1);
cudaMemcpyAsync(h_data1, d_data1, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(d_data2, h_data2, numElements*sizeof(float), cudaMemcpyHostToDevice, stream2);
cudaStreamWaitEvent(stream2, data1IsReady);
func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data1, d_data2, d_data2, numElements);
cudaMemcpyAsync(h_data2, d_data2, numElements*sizeof(float), cudaMemcpyDeviceToHost, stream2);
cudaDeviceSynchronize();
// Timing
clock_t finish = clock();
printf("The results are:\n");
for (int i = 0; i < numValuesToPrint; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
printf("...\n");
for (int i = numElements - numValuesToPrint; i < numElements; i++)
{
printf("%f, %f\n", h_data1[i], h_data2[i]);
}
double sum1 = 0.0;
double sum2 = 0.0;
for (int i = 0; i < numElements; i++)
{
sum1 += h_data1[i];
sum2 += h_data2[i];
}
printf("The summs are: %f and %f\n", sum1, sum2);
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
// Release the memory
cudaFreeHost(h_data1);
cudaFreeHost(h_data2);
cudaFree(d_data1);
cudaFree(d_data2);
return 0;
}
ADD SOME RESULTS
ADD SOME RESULTS
#include <cstdio>
#include <cassert>
const int blocksize = 96;
// kernel prototypes
__global__ void k1(void);
__global__ void k2(void);
// array of values to fill
__device__ int data[blocksize];
// kernel that fills the array in device memory
__global__ void k2(void)
{
int idx = threadIdx.x;
if (idx < blocksize) {
data[idx] = idx;
}
}
// kernel that calls the fill kernel
__global__ void k1(void)
{
int idx = threadIdx.x;
if (idx == 0) {
k2<<<1, blocksize>>>();
cudaDeviceSynchronize();
}
__syncthreads();
printf("Thread %i has value %i\n", idx, data[idx]);
}
int main(void)
{
k1<<<1, blocksize>>>();
cudaDeviceSynchronize();
return 0;
}
#include <cstdio>
#include <cmath>
#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
// Number of iterations in the kernel
#define ITER_MULTIPLIER 4
// Number of tests, number of streams doubles for each test. That is,
// there will be 2^N_TESTS streams in the last test
#define N_TESTS 4
// Information of stream for simple domain decomposition
struct stream {
cudaStream_t strm; // Stream
int len; // Length of the part for this stream
int start; // Offset to the start of the part
};
// Kernel for vector summation
__global__ void vector_add(double *C, const double *A, const double *B,
int N, int iterations)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Do not try to access past the allocated memory
for (int i = idx; i < N; i += stride) {
C[i] = 0;
for (int j = 0; j < ITER_MULTIPLIER * iterations; j++) {
C[i] += A[i] + B[i];
}
}
}
// Routine for stream test
void streamtest(double *hC, const double *hA, const double *hB,
double *dC, const double *dA, const double *dB,
stream *s, int nstreams, float *gputime, int tib,
int iterations)
{
// Add here the needed timing event calls
cudaEvent_t start, stop;
CUDA_CHECK( cudaEventCreate(&start) );
CUDA_CHECK( cudaEventCreate(&stop) );
CUDA_CHECK( cudaEventRecord(start) );
for (int i = 0; i < nstreams; ++i) {
// Add here the copy - kernel execution - copy sequence
// for each stream
//
// Each stream will copy their own part of the input data
// to the GPU starting from address &(dA[sidx]).
// Size of the block is slen (see variables below).
int sidx = s[i].start;
int slen = s[i].len;
CUDA_CHECK( cudaMemcpyAsync((void *)&(dA[sidx]), (void *)&(hA[sidx]),
sizeof(double) * slen,
cudaMemcpyHostToDevice, s[i].strm) );
CUDA_CHECK( cudaMemcpyAsync((void *)&(dB[sidx]), (void *)&(hB[sidx]),
sizeof(double) * slen,
cudaMemcpyHostToDevice, s[i].strm) );
// You can use these values for the grid and block sizes
dim3 grid, threads;
grid.x = (slen + tib - 1) / tib;
threads.x = tib;
vector_add<<<grid, threads, 0, s[i].strm>>>(&(dC[sidx]), &(dA[sidx]),
&(dB[sidx]), slen,
iterations);
CUDA_CHECK( cudaMemcpyAsync((void *)&(hC[sidx]), (void *)&(dC[sidx]),
sizeof(double) * slen,
cudaMemcpyDeviceToHost, s[i].strm) );
}
// Add the calls needed for execution timing and compute
// the elapsed time to the gputime variable
CUDA_CHECK( cudaEventRecord(stop) );
CUDA_CHECK( cudaEventSynchronize(stop) );
CHECK_ERROR_MSG("Stream test failed");
CUDA_CHECK( cudaEventElapsedTime(gputime, start, stop) );
CUDA_CHECK( cudaEventDestroy(start) );
CUDA_CHECK( cudaEventDestroy(stop) );
}
// Routine for default stream reference
void default_stream(double *hC, const double *hA, const double *hB,
double *dC, const double *dA, const double *dB,
int N, float *gputime, int tib, int iterations)
{
// Add here the needed timing event calls
cudaEvent_t start, stop;
CUDA_CHECK( cudaEventCreate(&start) );
CUDA_CHECK( cudaEventCreate(&stop) );
CUDA_CHECK( cudaEventRecord(start) );
// Non-asynchronous copies
CUDA_CHECK( cudaMemcpy((void *)dA, (void *)hA,
sizeof(double) * N,
cudaMemcpyHostToDevice) );
CUDA_CHECK( cudaMemcpy((void *)dB, (void *)hB,
sizeof(double) * N,
cudaMemcpyHostToDevice) );
dim3 grid, threads;
grid.x = (N + tib - 1) / tib;
threads.x = tib;
vector_add<<<grid, threads>>>(dC, dA, dB, N, iterations);
CUDA_CHECK( cudaMemcpyAsync((void *)hC, (void *)dC,
sizeof(double) * N,
cudaMemcpyDeviceToHost) );
// Add the calls needed for execution timing and compute
// the elapsed time to the gputime variable
CUDA_CHECK( cudaEventRecord(stop) );
CUDA_CHECK( cudaEventSynchronize(stop) );
CHECK_ERROR_MSG("Default stream test failed");
CUDA_CHECK( cudaEventElapsedTime(gputime, start, stop) );
CUDA_CHECK( cudaEventDestroy(start) );
CUDA_CHECK( cudaEventDestroy(stop) );
}
// Create the streams and compute the decomposition
void create_streams(int nstreams, int vecsize, stream **strm)
{
*strm = new stream[nstreams];
stream *s = *strm;
for(int i = 0; i < nstreams; i++) {
CUDA_CHECK( cudaStreamCreate(&s[i].strm) );
}
s[0].start = 0;
s[0].len = vecsize / nstreams;
s[0].len += vecsize % nstreams ? 1 : 0;
for(int i = 1; i < nstreams; i++) {
int offset = vecsize / nstreams;
if(i < vecsize % nstreams) {
offset++;
}
s[i].len = offset;
s[i].start = s[i-1].start + offset;
}
}
// Delete the streams
void destroy_streams(int nstreams, stream *s)
{
for(int i = 0; i < nstreams; i++) {
CUDA_CHECK( cudaStreamDestroy(s[i].strm) );
}
delete[] s;
}
int main(int argc, char *argv[])
{
const int ThreadsInBlock = 512;
int iterations;
double *dA, *dB, *dC;
double *hA, *hB, *hC;
double ref_value;
float gputime_ref, gputimes[N_TESTS];
stream *s;
cudaDeviceProp prop;
if (argc < 2) {
printf("Usage: %s N\nwhere N is the length of the vector.\n",
argv[0]);
exit(EXIT_FAILURE);
}
int N = atoi(argv[1]);
// Determine the number of available multiprocessors on the device.
// It is used for a coarse adjustment of the computation part of
// this test.
cudaGetDeviceProperties(&prop, 0);
iterations = (prop.multiProcessorCount + 1) / 2;
// Add here the host memory allocation routines (page-locked)
CUDA_CHECK( cudaMallocHost((void**)&hA, sizeof(double) * N) );
CUDA_CHECK( cudaMallocHost((void**)&hB, sizeof(double) * N) );
CUDA_CHECK( cudaMallocHost((void**)&hC, sizeof(double) * N) );
for(int i = 0; i < N; ++i) {
hA[i] = 1.0;
hB[i] = 2.0;
}
ref_value = 3.0 * ITER_MULTIPLIER * iterations;
CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double) * N) );
CUDA_CHECK( cudaMalloc((void**)&dB, sizeof(double) * N) );
CUDA_CHECK( cudaMalloc((void**)&dC, sizeof(double) * N) );
// Check the timings of default stream first
default_stream(hC, hA, hB, dC, dA, dB, N, &gputime_ref, ThreadsInBlock,
iterations);
// Here we loop over the test. On each iteration, we double the number
// of streams.
for(int strm = 0; strm < N_TESTS; strm++) {
int stream_count = 1<<strm;
create_streams(stream_count, N, &s);
streamtest(hC, hA, hB, dC, dA, dB, s, stream_count, &gputimes[strm],
ThreadsInBlock, iterations);
destroy_streams(stream_count, s);
}
CUDA_CHECK( cudaFree((void*)dA) );
CUDA_CHECK( cudaFree((void*)dB) );
CUDA_CHECK( cudaFree((void*)dC) );
int errorsum = 0;
for (int i = 0; i < N; i++) {
errorsum += hC[i] - ref_value;
}
printf("Error sum = %i\n", errorsum);
printf("Time elapsed for reference run: %f\n", gputime_ref / 1000.);
for(int i = 0; i < N_TESTS; i++) {
printf("Time elapsed for test %2i: %f\n", 1<<i,
gputimes[i] / 1000.);
}
// Add here the correct host memory freeing routines
CUDA_CHECK( cudaFreeHost((void*)hA) );
CUDA_CHECK( cudaFreeHost((void*)hB) );
CUDA_CHECK( cudaFreeHost((void*)hC) );
return 0;
}
#include <sys/time.h>
#include <cstdio>
#include "jacobi.h"
#include "error_checks.h"
// Change this to 0 if CPU reference result is not needed
#define COMPUTE_CPU_REFERENCE 1
#define MAX_ITERATIONS 3000
// CPU kernel
void sweepCPU(double* phi, const double *phiPrev, const double *source,
double h2, int N)
{
int i, j;
int index, i1, i2, i3, i4;
for (j = 1; j < N-1; j++) {
for (i = 1; i < N-1; i++) {
index = i + j*N;
i1 = (i-1) + j * N;
i2 = (i+1) + j * N;
i3 = i + (j-1) * N;
i4 = i + (j+1) * N;
phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] +
phiPrev[i3] + phiPrev[i4] -
h2 * source[index]);
}
}
}
// GPU kernel
__global__
void sweepGPU(double *phi, const double *phiPrev, const double *source,
double h2, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int index = i + j*N;
int i1, i2, i3, i4;
i1 = (i-1) + j * N;
i2 = (i+1) + j * N;
i3 = i + (j-1) * N;
i4 = i + (j+1) * N;
if (i > 0 && j > 0 && i < N-1 && j < N-1)
phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] +
phiPrev[i3] + phiPrev[i4] -
h2 * source[index]);
}
double compareArrays(const double *a, const double *b, int N)
{
double error = 0.0;
int i;
for (i = 0; i < N*N; i++) {
error += fabs(a[i] - b[i]);
}
return error/(N*N);
}
double diffCPU(const double *phi, const double *phiPrev, int N)
{
int i;
double sum = 0;
double diffsum = 0;
for (i = 0; i < N*N; i++) {
diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]);
sum += phi[i] * phi[i];
}
return sqrt(diffsum/sum);
}
int main()
{
timeval t1, t2; // Structs for timing
const int N = 512;
double h = 1.0 / (N - 1);
int iterations;
const double tolerance = 5e-4; // Stopping condition
int i, j, index;
const int blocksize = 16;
double *phi = new double[N*N];
double *phiPrev = new double[N*N];
double *source = new double[N*N];
double *phi_cuda = new double[N*N];
double *phi_d, *phiPrev_d, *source_d;
// Size of the arrays in bytes
const int size = N*N*sizeof(double);
double diff;
// Source initialization
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
double x, y;
x = (i - N / 2) * h;
y = (j - N / 2) * h;
index = j + i * N;
if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
source[index] = 1e10*h*h;
else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
source[index] = -1e10*h*h;
else
source[index] = 0.0;
}
}
CUDA_CHECK( cudaMalloc( (void**)&source_d, size) );
CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) );
// Reset values to zero
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
index = j + i * N;
phi[index] = 0.0;
phiPrev[index] = 0.0;
}
}
CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) );
CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) );
CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) );
CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) );
// CPU version
if(COMPUTE_CPU_REFERENCE) {
gettimeofday(&t1, NULL);
// Do sweeps untill difference is under the tolerance
diff = tolerance * 2;
iterations = 0;
while (diff > tolerance && iterations < MAX_ITERATIONS) {
sweepCPU(phiPrev, phi, source, h * h, N);
sweepCPU(phi, phiPrev, source, h * h, N);
iterations += 2;
if (iterations % 100 == 0) {
diff = diffCPU(phi, phiPrev, N);
printf("%d %g\n", iterations, diff);
}
}
gettimeofday(&t2, NULL);
printf("CPU Jacobi: %g seconds, %d iterations\n",
t2.tv_sec - t1.tv_sec +
(t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
}
// GPU version
dim3 dimBlock(blocksize, blocksize);
dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize);
//do sweeps until diff under tolerance
diff = tolerance * 2;
iterations = 0;
gettimeofday(&t1, NULL);
while (diff > tolerance && iterations < MAX_ITERATIONS) {
// See above how the CPU update kernel is called
// and implement similar calling sequence for the GPU code
//// Add routines here
sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h*h, N);
sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h*h, N);
CHECK_ERROR_MSG("Jacobi kernels");
iterations += 2;
if (iterations % 100 == 0) {
// diffGPU is defined in the header file, it uses
// Thrust library for reduction computation
diff = diffGPU<double>(phiPrev_d, phi_d, N);
CHECK_ERROR_MSG("Difference computation");
printf("%d %g\n", iterations, diff);
}
}
CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, size, cudaMemcpyDeviceToHost) );
gettimeofday(&t2, NULL);
printf("GPU Jacobi: %g seconds, %d iterations\n",
t2.tv_sec - t1.tv_sec +
(t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
//// Add here the clean up code for all allocated CUDA resources
CUDA_CHECK( cudaFree(phi_d) );
CUDA_CHECK( cudaFree(phiPrev_d) );
CUDA_CHECK( cudaFree(source_d) );
if (COMPUTE_CPU_REFERENCE) {
printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
}
delete[] phi;
delete[] phi_cuda;
delete[] phiPrev;
delete[] source;
return EXIT_SUCCESS;
}
…