HIP Coding Practice
-
Compiling a program for HIP
-
For example, to compile MyProg.cu you would use a command like
-
hipcc -o MyProg MyProg.cu
-
Code Array Addition Device GPU
#include "hip/hip_runtime.h"
//
// Build (HIP): hipcc 03_array_addition_deviceFunc.cu
//
#include <math.h>
#include <stdio.h>
// Tolerance used by array_check when comparing results against c.
const double EPSILON = 1.0e-15;
const double a = 1.23;  // value stored in every element of vecA
const double b = 2.34;  // value stored in every element of vecB
const double c = 3.57;  // expected elementwise sum (a + b)
// Three equivalent kernels computing vecC = vecA + vecB; each delegates to a
// different style of device helper (return value, output pointer, output reference).
void __global__ array_addition1(const double *vecA, const double *vecB, double *vecC, const int NX);
void __global__ array_addition2(const double *vecA, const double *vecB, double *vecC, const int NX);
void __global__ array_addition3(const double *vecA, const double *vecB, double *vecC, const int NX);
// Host-side check that every vecC element equals c within EPSILON.
void array_check(const double *vecC, int NX);
// Host driver: fills two vectors with the constants a and b, runs three
// equivalent GPU addition kernels, and verifies each result against c.
// Returns 0 on success, 1 if host allocation fails.
int main(int argc, const char * argv[])
{
    printf("\n--Beginning of the main function.\n");

    const int NX = 25600004;                       // number of vector elements
    const size_t size_array = sizeof(double) * NX; // bytes per vector (~195 MiB)

    // Host buffers; malloc can realistically fail at this size, so check before use.
    double *h_vecA = (double *)malloc(size_array);
    double *h_vecB = (double *)malloc(size_array);
    double *h_vecC = (double *)malloc(size_array);
    if (h_vecA == NULL || h_vecB == NULL || h_vecC == NULL)
    {
        fprintf(stderr, "Error: host memory allocation failed.\n");
        free(h_vecA);
        free(h_vecB);
        free(h_vecC);
        return 1;
    }
    for (int i = 0; i < NX; i++)
    {
        h_vecA[i] = a;
        h_vecB[i] = b;
    }

    // Device buffers and host-to-device transfer.
    double *d_vecA, *d_vecB, *d_vecC;
    hipMalloc((void **)&d_vecA, size_array);
    hipMalloc((void **)&d_vecB, size_array);
    hipMalloc((void **)&d_vecC, size_array);
    hipMemcpy(d_vecA, h_vecA, size_array, hipMemcpyHostToDevice);
    hipMemcpy(d_vecB, h_vecB, size_array, hipMemcpyHostToDevice);

    // Ceil-division so the grid covers all NX elements even when NX is not a
    // multiple of the block size.
    const int block_size = 128;
    const int grid_size = (NX + block_size - 1) / block_size;

    // Launch each kernel variant, copy the result back (hipMemcpy is a
    // synchronizing call), and verify it on the host.
    hipLaunchKernelGGL(array_addition1, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    if (hipGetLastError() != hipSuccess)  // catch launch-configuration errors
        fprintf(stderr, "Error: launch of array_addition1 failed.\n");
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    hipLaunchKernelGGL(array_addition2, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    if (hipGetLastError() != hipSuccess)
        fprintf(stderr, "Error: launch of array_addition2 failed.\n");
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    hipLaunchKernelGGL(array_addition3, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    if (hipGetLastError() != hipSuccess)
        fprintf(stderr, "Error: launch of array_addition3 failed.\n");
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    // Release host and device memory.
    free(h_vecA);
    free(h_vecB);
    free(h_vecC);
    hipFree(d_vecA);
    hipFree(d_vecB);
    hipFree(d_vecC);
    printf("\n--Ending of the main function.\n\n");
    return 0;
}
// Device helper (value-return variant): returns the sum of two doubles.
double __device__ array_addition1_device(const double aa, const double bb)
{
    const double sum = aa + bb;
    return sum;
}
// Elementwise vecC = vecA + vecB using the value-returning device helper.
// One thread per element; the guard handles grids that over-cover NX.
void __global__ array_addition1(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NX)
        return;
    vecC[idx] = array_addition1_device(vecA[idx], vecB[idx]);
}
// Device helper (output-pointer variant): writes vecA + vecB through vecC.
void __device__ array_addition2_device(const double vecA, const double vecB, double *vecC)
{
    vecC[0] = vecA + vecB;
}
// Elementwise vecC = vecA + vecB using the pointer-output device helper.
void __global__ array_addition2(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NX)
        return;
    // The helper stores its result directly into vecC[idx].
    array_addition2_device(vecA[idx], vecB[idx], vecC + idx);
}
// Device helper (reference-output variant): assigns vecA + vecB into vecC.
void __device__ array_addition3_device(const double vecA, const double vecB, double &vecC)
{
    vecC = vecA + vecB;
}
// Elementwise vecC = vecA + vecB using the reference-output device helper.
void __global__ array_addition3(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < NX)
    {
        array_addition3_device(vecA[idx], vecB[idx], vecC[idx]);
    }
}
// Scan vecC and report whether every element equals the expected constant c
// to within EPSILON; prints a one-line verdict either way.
void array_check(const double *vecC, const int NX)
{
    const char *verdict = "|| NO ERROR ||";
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
        {
            verdict = "|| ERROR ||";  // first mismatch decides; no need to keep scanning
            break;
        }
    }
    printf("\n\tChecking array addition results >>> %s\n", verdict);
}
TODO: add sample program output here (the three "Checking array addition results" lines).
-
Matrix Summation Examples on GPU
Code GPU Grid Info
#include "hip/hip_runtime.h"
//
// Build (HIP): hipcc 01_GPU_grid_block_thread_info.cu
//
#include <hip/hip_runtime.h>  // NOTE(review): duplicate of the include above — harmless but redundant
#include <stdio.h>
#include "error_checker.h"    // provides the CHECK() error-checking macro used below
// Kernel that prints gridDim/blockDim/blockIdx/threadIdx from the device side.
void __global__ check_grid_block_thread_info_GPU(void);
// Host driver: shows how the grid size scales as the block size shrinks for a
// fixed element count, then prints the launch configuration from both the
// host side and a device kernel.
int main(int argc, const char * argv[])
{
    printf("\n--Beginning of the main function.\n\n");
    printf("\t***************************************************\n");
    printf("\t********** Output for num_element = 1024 **********\n");
    printf("\t***************************************************\n\n");
    int num_elements = 1024;
    printf("\t\tThere are %d data, which can be distributed:\n", num_elements);
    // define grid and block structure
    dim3 block(1024); // == (block.x = 1024; block.y = 1; block.z = 1;)
    dim3 grid((num_elements + block.x - 1) / block.x); // ceil-div covers every element
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
    // halve the block size and recompute the grid: 512 threads per block
    block.x = 512;
    grid.x = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
    // 256 threads per block
    block.x = 256;
    grid.x = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);
    // 128 threads per block
    block.x = 128;
    grid.x = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n\n", grid.x, block.x);
    CHECK(hipDeviceSynchronize());
    printf("\t***************************************************\n");
    printf("\t*********** Output for num_element = 16 ***********\n");
    printf("\t***************************************************\n\n");
    // reset the total number of data elements and the launch geometry
    num_elements = 16;
    block.x = 2;
    grid.x = (num_elements + block.x - 1) / block.x;
    // check grid and block info from the host side
    printf("\t\t- CPU output -- grid.x=%d, grid.y=%d, grid.z=%d\n", grid.x, grid.y, grid.z);
    printf("\t\t- CPU output -- block.x=%d, block.y=%d, block.z=%d\n", block.x, block.y, block.z);
    putchar('\n');
    // check the same info from the device side
    hipLaunchKernelGGL(check_grid_block_thread_info_GPU, grid, block, 0, 0);
    CHECK(hipGetLastError());      // catch launch-configuration errors
    CHECK(hipDeviceSynchronize()); // ensure device printf output is flushed before reset
    CHECK(hipDeviceReset());
    printf("\n--Ending of the main function.\n\n");
    return 0;
}
// Print this thread's view of the launch configuration: grid and block
// dimensions plus its own block/thread indices. Device printf is slow and
// serialized — demonstration/debugging use only.
void __global__ check_grid_block_thread_info_GPU(void)
{
    printf("\t\t- GPU output -- gridDim=(%d, %d, %d) blockDim=(%d, %d, %d) blockIdx=(%d, %d, %d) threadIdx=(%d, %d, %d)\n",
           (int)gridDim.x, (int)gridDim.y, (int)gridDim.z,
           (int)blockDim.x, (int)blockDim.y, (int)blockDim.z,
           (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
           (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z);
}
Code Matrix Thread Index Info
#include "hip/hip_runtime.h"
//
// Build (HIP): hipcc 02_matrix_thread_index_info.cu
//
#include <cstdio>
#include <hip/hip_runtime.h>  // NOTE(review): "hip/hip_runtime.h" is already included above
#include "error_checker.h"    // provides the CHECK() macro
// Fill matrix[0..nxy-1] with its own linear index.
void initialInt(int *matrix, int nxy);
// Print an ny-row by nx-column int matrix, one row per line.
void printMatrix(int *h_matrixA, const int nx, const int ny);
// Each GPU thread prints its block/thread coordinates and its matrix element.
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny);
// Host driver: builds an 8x6 index matrix, prints it on the CPU, then has
// each GPU thread print the coordinates and value of the element it owns.
int main(int argc, const char * argv[])
{
    printf("\n--Beginning of the main function.\n");
    int dev = 0;
    hipDeviceProp_t deviceProp;
    CHECK(hipGetDeviceProperties(&deviceProp, dev));
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name);
    // matrix dimensions: nx columns by ny rows, stored row-major
    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    int size_matrix = nxy * (sizeof(int));
    // malloc host mem and fill it with linear indices
    int *h_matrixA = (int *)malloc(size_matrix);
    initialInt(h_matrixA, nxy);
    printMatrix(h_matrixA, nx, ny);
    // malloc device mem and copy data from CPU to GPU
    int *d_matrixA;
    CHECK(hipMalloc((void **)&d_matrixA, size_matrix));
    CHECK(hipMemcpy(d_matrixA, h_matrixA, size_matrix, hipMemcpyHostToDevice));
    // execution configuration: block(4,2) tiles the 8x6 matrix exactly
    dim3 block(4, 2);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printf("\ngrid info >>> grid.x=%d grid.y=%d grid.z=%d.\n", grid.x, grid.y, grid.z);
    printf("block info >>> block.x=%d block.y=%d block.z=%d.\n\n", block.x, block.y, block.z);
    // invoke kernel, then synchronize so device printf output is flushed
    hipLaunchKernelGGL(printGPUIdx, grid, block, 0, 0, d_matrixA, nx, ny);
    CHECK(hipGetLastError());  // catch launch-configuration errors
    CHECK(hipDeviceSynchronize());
    printf("\n");
    // free host and device memory
    free(h_matrixA);
    CHECK(hipFree(d_matrixA));
    // reset device
    CHECK(hipDeviceReset());
    printf("\n--Ending of the main function.\n\n");
    return 0;
}
// Fill matrix[0..nxy-1] with its own linear index (0, 1, 2, ...).
void initialInt(int *matrix, int nxy){
    int value = 0;
    while (value < nxy)
    {
        matrix[value] = value;
        ++value;
    }
}
// Print an ny-row by nx-column int matrix (row-major), one row per line,
// preceded by a "Matrix:nx, ny" header.
void printMatrix(int *h_matrixA, const int nx, const int ny){
    printf("\nMatrix:%d, %d\n", nx, ny);
    for (int iy = 0; iy < ny; iy++){
        const int *row = h_matrixA + iy * nx;  // start of row iy
        for (int ix = 0; ix < nx; ix++)
            printf("%3d ", row[ix]);
        printf("\n");
    }
}
// Each thread prints its block/thread indices, its (ix, iy) matrix coordinate,
// its flattened global index, and the element value it owns.
// Added a bounds guard: with the current 8x6 / block(4,2) launch the grid
// divides the domain exactly, but any other launch shape would read past the
// end of d_matrixA without it.
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny){
    int ix = threadIdx.x + blockIdx.x*blockDim.x;
    int iy = threadIdx.y + blockIdx.y*blockDim.y;
    if (ix >= nx || iy >= ny)
        return;  // out-of-range thread: no element to report
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = iy*nx + ix;  // row-major flattening
    printf("block_id (%d %d) thread_id (%d,%d) coordinate (%d %d) global_index (%d) value (%d)\n",bx, by, tx, ty, ix, iy, idx, d_matrixA[idx]);
}
Code Matrix Summation
#include "hip/hip_runtime.h"
//
// Build (HIP): hipcc 03_matrix_summation_GPU_2D2D_2D1D_1D1D.cu
//
#include <hip/hip_runtime.h>  // NOTE(review): duplicate of the include above
#include <stdio.h>
#include "error_checker.h"    // provides the CHECK() macro
// Tolerance for the CPU-vs-GPU comparison; both sides perform the same
// single-precision add, so the difference should be tiny in practice.
const double EPSILON = 1.0E-8;
void matrix_initialization(float *ip, const int size);
void matrix_summation_on_CPU(float *A, float *B, float *C, const int, const int);
void check_results_from_CPU_GPU(float *fromCPU, float *fromGPU, const int);
// Three equivalent GPU summation kernels with different launch geometries.
void __global__ matrix_summation_on_GPU_1D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D2D(float *A, float *B, float *C, int, int);
// Drive three equivalent GPU matrix-summation kernels (2D2D, 2D1D, 1D1D) over
// a 1024x1024 float matrix, timing each with hip events and checking every
// result against a CPU reference.
// Fixes vs. the original:
//  - hipEventElapsedTime reports MILLISECONDS, so the timing messages now say
//    "ms" instead of "sec";
//  - the start/stop events are created once and reused (re-creating them
//    before each measurement leaked the previous handles);
//  - every HIP call is wrapped in CHECK() consistently.
int main(int argc, const char * argv[])
{
    printf("\n--Beginning of the main function.\n");

    // set up device
    int dev = 0;
    hipDeviceProp_t deviceProp;
    CHECK(hipGetDeviceProperties(&deviceProp, dev));
    printf("\n\tUsing Device %d: %s\n", dev, deviceProp.name);
    CHECK(hipSetDevice(dev));

    // set up data size of matrix: 1024 x 1024 floats
    int nx = 1 << 10;
    int ny = 1 << 10;
    int nxy = nx * ny;
    int size_matrix = nxy * sizeof(float);
    printf("\n\tMatrix size: nx=%d ny=%d\n", nx, ny);

    // malloc host memory
    float *h_matrixA, *h_matrixB, *h_matrixSumFromCPU, *h_matrixSumFromGPU;
    h_matrixA = (float *)malloc(size_matrix);
    h_matrixB = (float *)malloc(size_matrix);
    h_matrixSumFromCPU = (float *)malloc(size_matrix);
    h_matrixSumFromGPU = (float *)malloc(size_matrix);

    // create the timing events ONCE and reuse them for every measurement
    hipEvent_t start, stop;
    CHECK(hipEventCreate(&start));
    CHECK(hipEventCreate(&stop));

    // initialize data at host side (timed)
    CHECK(hipEventRecord(start));
    matrix_initialization(h_matrixA, nxy);
    matrix_initialization(h_matrixB, nxy);
    CHECK(hipEventRecord(stop));
    CHECK(hipEventSynchronize(stop));
    float elapsed_time;
    CHECK(hipEventElapsedTime(&elapsed_time, start, stop));
    printf("\tMatrix initialization on host(CPU) elapsed %f ms\n", elapsed_time);

    memset(h_matrixSumFromCPU, 0, size_matrix);
    memset(h_matrixSumFromGPU, 0, size_matrix);

    // summation of matrix elements at host(CPU) side — timed reference result
    CHECK(hipEventRecord(start));
    matrix_summation_on_CPU(h_matrixA, h_matrixB, h_matrixSumFromCPU, nx, ny);
    CHECK(hipEventRecord(stop));
    CHECK(hipEventSynchronize(stop));
    CHECK(hipEventElapsedTime(&elapsed_time, start, stop));
    printf("\tMatrix summation on host(CPU) elapsed %f ms\n", elapsed_time);

    // malloc device global memory
    float *d_matrixA, *d_matrixB, *d_matrixC;
    CHECK(hipMalloc((void **)&d_matrixA, size_matrix));
    CHECK(hipMalloc((void **)&d_matrixB, size_matrix));
    CHECK(hipMalloc((void **)&d_matrixC, size_matrix));

    // transfer data from host to device
    CHECK(hipMemcpy(d_matrixA, h_matrixA, size_matrix, hipMemcpyHostToDevice));
    CHECK(hipMemcpy(d_matrixB, h_matrixB, size_matrix, hipMemcpyHostToDevice));

    //---------------
    // summation on GPU using 2D grid and 2D block
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);                                                // (32, 32, 1)
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); // (32, 32, 1)
    CHECK(hipEventRecord(start));
    hipLaunchKernelGGL(matrix_summation_on_GPU_2D2D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    CHECK(hipGetLastError());       // catch launch-configuration errors
    CHECK(hipDeviceSynchronize());
    CHECK(hipEventRecord(stop));
    CHECK(hipEventSynchronize(stop));
    CHECK(hipEventElapsedTime(&elapsed_time, start, stop));
    printf("\n\tMatrix summation on GPU (2D_grid 2D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f ms\n",
           grid.x, grid.y, block.x, block.y, elapsed_time);
    // copy kernel result back to host side and compare with the reference
    CHECK(hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost));
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);

    //---------------
    // summation on GPU using 2D grid and 1D block
    dimy = 1;
    block.y = dimy;                         // block (32, 1, 1)
    grid.x = (nx + block.x - 1) / block.x;
    grid.y = ny;                            // grid (32, 1024, 1)
    CHECK(hipEventRecord(start));
    hipLaunchKernelGGL(matrix_summation_on_GPU_2D1D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    CHECK(hipGetLastError());
    CHECK(hipDeviceSynchronize());
    CHECK(hipEventRecord(stop));
    CHECK(hipEventSynchronize(stop));
    CHECK(hipEventElapsedTime(&elapsed_time, start, stop));
    printf("\n\tMatrix summation on GPU (2D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f ms\n",
           grid.x, grid.y, block.x, block.y, elapsed_time);
    CHECK(hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost));
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);

    //---------------
    // summation on GPU using 1D grid and 1D block
    dimy = 1;
    block.y = dimy;                         // block (32, 1, 1)
    grid.x = (nx + block.x - 1) / block.x;
    grid.y = 1;                             // grid (32, 1, 1)
    CHECK(hipEventRecord(start));
    hipLaunchKernelGGL(matrix_summation_on_GPU_1D1D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    CHECK(hipGetLastError());
    CHECK(hipDeviceSynchronize());
    CHECK(hipEventRecord(stop));
    CHECK(hipEventSynchronize(stop));
    CHECK(hipEventElapsedTime(&elapsed_time, start, stop));
    printf("\n\tMatrix summation on GPU (1D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f ms\n",
           grid.x, grid.y, block.x, block.y, elapsed_time);
    CHECK(hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost));
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);

    //---------------
    // destroy start and stop events
    CHECK(hipEventDestroy(start));
    CHECK(hipEventDestroy(stop));
    // free host memory and device global memory
    free(h_matrixA);
    free(h_matrixB);
    free(h_matrixSumFromCPU);
    free(h_matrixSumFromGPU);
    CHECK(hipFree(d_matrixA));
    CHECK(hipFree(d_matrixB));
    CHECK(hipFree(d_matrixC));
    CHECK(hipDeviceReset());
    printf("\n--Ending of the main function.\n\n");
    return 0;
}
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5]
// (low byte of rand(), scaled by 1/10).
void matrix_initialization(float *ip, const int size)
{
    for (int k = 0; k != size; ++k)
    {
        const int low_byte = rand() & 0xFF;  // 0..255
        ip[k] = low_byte / 10.0f;
    }
}
// CPU reference: elementwise matrixC = matrixA + matrixB over an ny-by-nx
// row-major matrix. The rows are contiguous, so a single flat loop over all
// nx*ny elements is equivalent to the row-by-row traversal.
void matrix_summation_on_CPU(float *matrixA, float *matrixB, float *matrixC,
const int nx, const int ny)
{
    const int total = nx * ny;
    for (int idx = 0; idx < total; idx++)
        matrixC[idx] = matrixA[idx] + matrixB[idx];
}
// 2D grid of 2D blocks: one thread per matrix element. The guard discards
// threads that fall outside the nx-by-ny domain.
void __global__ matrix_summation_on_GPU_2D2D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (col < nx && row < ny)
    {
        const unsigned int flat = row * nx + col;  // row-major flattening
        matrixC[flat] = matrixA[flat] + matrixB[flat];
    }
}
// 2D grid of 1D blocks: blockIdx.y selects the row, threads cover the columns.
void __global__ matrix_summation_on_GPU_2D1D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int row = blockIdx.y;
    if (col < nx && row < ny)
    {
        const unsigned int flat = row * nx + col;  // row-major flattening
        matrixC[flat] = matrixA[flat] + matrixB[flat];
    }
}
// 1D grid of 1D blocks: each thread owns one column and loops over all rows.
void __global__ matrix_summation_on_GPU_1D1D(float *matrixA, float *matrixB,
float *matrixC, int nx, int ny)
{
    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (col >= nx)
        return;  // thread has no column to process
    for (int row = 0; row < ny; row++)
    {
        int flat = row * nx + col;
        matrixC[flat] = matrixA[flat] + matrixB[flat];
    }
}
// Compare the CPU reference result with the GPU result element by element and
// print a one-line verdict; on the first mismatch, also print both values.
// Fix: the original used unqualified abs() on a float difference, which can
// resolve to the integer overload (truncating toward zero and masking
// sub-1.0 mismatches) depending on which headers are in scope. The absolute
// value is now computed explicitly in floating point.
void check_results_from_CPU_GPU(float *h_matrixSumFromCPU,
float *h_matrixSumFromGPU, const int N)
{
    bool has_error = false;
    for (int i = 0; i < N; i++)
    {
        // floating-point absolute difference (no <math.h> dependency)
        float diff = h_matrixSumFromCPU[i] - h_matrixSumFromGPU[i];
        if (diff < 0.0f)
            diff = -diff;
        if (diff > EPSILON)
        {
            has_error = true;
            printf("host %f gpu %f\n", h_matrixSumFromCPU[i], h_matrixSumFromGPU[i]);
            break;
        }
    }
    printf("\tChecking matrix summation results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
…