HIP Coding Practice

  • Compiling a program for HIP

    • For example, to compile MyProg.cu with the hipcc compiler driver you would use a command like

    • hipcc -o MyProg MyProg.cu
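
    • On a CUDA backend hipcc forwards to nvcc, so the same command line works on both AMD and NVIDIA systems; for instance, the first GPU listing below could be built and run with hipcc -o 02_array_addition_gpu 02_array_addition_gpu.cu followed by ./02_array_addition_gpu (assuming hipcc is installed and on the PATH).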

Array Addition Examples in CPU/GPU

Code Array Addition CPU
//
// gcc   01_array_addition_cpu.cpp
// hipcc 01_array_addition_cpu.cpp
//
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

const double EPSILON = 1.0E-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX);
void array_check(const double *vecC, const int NX);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    const int NX = 1000000;
	int size_array = sizeof(double) * NX;
    double *vecA = (double *)malloc(size_array);
    double *vecB = (double *)malloc(size_array);
    double *vecC = (double *)malloc(size_array);

    for (int i = 0; i < NX; i++)
    {
        vecA[i] = a;
        vecB[i] = b;
    }

    array_addition(vecA, vecB, vecC, NX);
    array_check(vecC, NX);

    free(vecA);
    free(vecB);
    free(vecC);

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void array_addition(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    for (int i = 0; i < NX; i++)
        vecC[i] = vecA[i] + vecB[i];
}


void array_check(const double *vecC, const int NX)
{
    bool has_error = false;
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
		{
            has_error = true;
			break;
		}
    }
    printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
Code Array Addition GPU
#include "hip/hip_runtime.h"
//
// hipcc 02_array_addition_gpu.cu
//
#include <math.h>
#include <stdio.h>
#include <stdlib.h> // malloc/free

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void __global__ array_addition(const double *vecA, const double *vecB, double *vecC);
void array_check(const double *vecC, const int NX);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    const int NX = 25600004; // try 25600000, 25600004 and 25600008
	int size_array = sizeof(double) * NX;

    double *h_vecA = (double *)malloc(size_array); // 'h' for host (CPU)
    double *h_vecB = (double *)malloc(size_array);
    double *h_vecC = (double *)malloc(size_array);

    for (int i = 0; i < NX; i++)
    {
        h_vecA[i] = a;
        h_vecB[i] = b;
    }

    double *d_vecA, *d_vecB, *d_vecC; // 'd' for device (GPU)
    hipMalloc((void **)&d_vecA, size_array);
    hipMalloc((void **)&d_vecB, size_array);
    hipMalloc((void **)&d_vecC, size_array);
    hipMemcpy(d_vecA, h_vecA, size_array, hipMemcpyHostToDevice); // copy data from host(CPU) to device(GPU)
    hipMemcpy(d_vecB, h_vecB, size_array, hipMemcpyHostToDevice);

    const int block_size = 128;
    int grid_size = (NX + block_size - 1) / block_size;

	printf("\n\tArray_size = %d, grid_size = %d and block_size = %d.\n", NX, grid_size, block_size);
    hipLaunchKernelGGL(array_addition, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC);

    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost); // copy data from device(GPU) to host(CPU)
    array_check(h_vecC, NX);

    free(h_vecA);
    free(h_vecB);
    free(h_vecC);
    hipFree(d_vecA);
    hipFree(d_vecB);
    hipFree(d_vecC);

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void __global__ array_addition(const double *vecA, const double *vecB, double *vecC)
{
    // Note: no bounds check. When NX is not a multiple of block_size, the surplus
    // threads of the last block access memory past the end of the arrays.
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    vecC[i] = vecA[i] + vecB[i];
}


void array_check(const double *vecC, const int NX)
{
    bool has_error = false;
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
		{
            has_error = true;
			break;
		}
    }
    printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
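
The kernel above performs no bounds check, so with NX = 25600004 or 25600008 the surplus threads of the last block read and write past the end of the device arrays. A minimal sketch (an addition, not part of the original listing) of how the launch and execution status could be inspected with the HIP error API right after the launch:

    // Sketch: error checking after the kernel launch; d_vecA/d_vecB/d_vecC,
    // grid_size and block_size are the variables from the listing above.
    hipLaunchKernelGGL(array_addition, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC);
    hipError_t err = hipGetLastError();        // errors from the launch itself
    if (err != hipSuccess)
        printf("Kernel launch failed: %s\n", hipGetErrorString(err));
    err = hipDeviceSynchronize();              // errors raised while the kernel runs
    if (err != hipSuccess)
        printf("Kernel execution failed: %s\n", hipGetErrorString(err));

Out-of-bounds accesses are not guaranteed to be reported this way, but checking the launch and the synchronization result is the usual first step when an array-size/block-size mismatch is suspected.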
Code Array Addition Device GPU
#include "hip/hip_runtime.h"
//
// hipcc 03_array_addition_deviceFunc.cu
//
#include <math.h>
#include <stdio.h>
#include <stdlib.h> // malloc/free

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void __global__ array_addition1(const double *vecA, const double *vecB, double *vecC, const int NX);
void __global__ array_addition2(const double *vecA, const double *vecB, double *vecC, const int NX);
void __global__ array_addition3(const double *vecA, const double *vecB, double *vecC, const int NX);
void array_check(const double *vecC, int NX);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    const int NX = 25600004;
	int size_array = sizeof(double) * NX;

    double *h_vecA = (double *)malloc(size_array);
    double *h_vecB = (double *)malloc(size_array);
    double *h_vecC = (double *)malloc(size_array);

    for (int i = 0; i < NX; i++)
    {
        h_vecA[i] = a;
        h_vecB[i] = b;
    }

    double *d_vecA, *d_vecB, *d_vecC;
    hipMalloc((void **)&d_vecA, size_array);
    hipMalloc((void **)&d_vecB, size_array);
    hipMalloc((void **)&d_vecC, size_array);
    hipMemcpy(d_vecA, h_vecA, size_array, hipMemcpyHostToDevice);
    hipMemcpy(d_vecB, h_vecB, size_array, hipMemcpyHostToDevice);

    const int block_size = 128;
    int grid_size = (NX + block_size - 1) / block_size;

	//	defining three kernel functions for array addition in GPU
    hipLaunchKernelGGL(array_addition1, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    hipLaunchKernelGGL(array_addition2, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    hipLaunchKernelGGL(array_addition3, grid_size, block_size, 0, 0, d_vecA, d_vecB, d_vecC, NX);
    hipMemcpy(h_vecC, d_vecC, size_array, hipMemcpyDeviceToHost);
    array_check(h_vecC, NX);

    free(h_vecA);
    free(h_vecB);
    free(h_vecC);
    hipFree(d_vecA);
    hipFree(d_vecB);
    hipFree(d_vecC);

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


double __device__ array_addition1_device(const double aa, const double bb)
{
    return (aa + bb);
}
void __global__ array_addition1(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < NX)
        vecC[i] = array_addition1_device(vecA[i], vecB[i]); // vecC[i] = vecA[i] + vecB[i];
}


void __device__ array_addition2_device(const double vecA, const double vecB, double *vecC)
{
    *vecC = vecA + vecB;
}
void __global__ array_addition2(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < NX)
        array_addition2_device(vecA[i], vecB[i], &vecC[i]); // vecC[i] = vecA[i] + vecB[i];
}


void __device__ array_addition3_device(const double vecA, const double vecB, double &vecC)
{
    vecC = vecA + vecB;
}
void __global__ array_addition3(const double *vecA, const double *vecB, double *vecC, const int NX)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < NX)
        array_addition3_device(vecA[i], vecB[i], vecC[i]); // vecC[i] = vecA[i] + vecB[i];
}


void array_check(const double *vecC, const int NX)
{
    bool has_error = false;
    for (int i = 0; i < NX; i++)
    {
        if (fabs(vecC[i] - c) > EPSILON)
		{
            has_error = true;
			break;
		}
    }
    printf("\n\tChecking array addition results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
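
The three kernels above give identical results and, unlike the previous listing, each guards the access with if (i < NX), so NX no longer has to be a multiple of block_size; they differ only in how the __device__ helper hands the sum back (return value, pointer argument, reference argument). As a related sketch (an assumption, not something the listing uses), such a helper can also carry both qualifiers so that the very same function is callable from host and device code:

    #include "hip/hip_runtime.h"

    // Hypothetical helper (not in the original listing): with both qualifiers the
    // same function can be called from CPU code and from GPU kernels.
    __host__ __device__ double add_pair(const double aa, const double bb)
    {
        return aa + bb;
    }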

ADD SOME RESULTS

  • Matrix Summation Examples in GPU

Code GPU Grid Info
#include "hip/hip_runtime.h"
//
// nvcc 01_GPU_grid_block_thread_info.cu
//
#include <hip/hip_runtime.h>
#include <stdio.h>
#include "error_checker.h"

void __global__ check_grid_block_thread_info_GPU(void);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n\n");

	printf("\t***************************************************\n");
	printf("\t********** Output for num_element = 1024 **********\n");
	printf("\t***************************************************\n\n");

    int num_elements = 1024;
	printf("\t\tThere are %d data, which can be distributed:\n", num_elements);

    // define grid and block structure
    dim3 block(1024); // == (block.x = 1024; block.y = 1; block.z = 1;)
    dim3 grid((num_elements + block.x - 1) / block.x);
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 512;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 256;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n", grid.x, block.x);

    // reset block
    block.x = 128;
    grid.x  = (num_elements + block.x - 1) / block.x;
    printf("\t\t- grid.x=%d, block.x=%d\n\n", grid.x, block.x);

    CHECK(hipDeviceSynchronize());


	printf("\t***************************************************\n");
	printf("\t*********** Output for num_element = 16 ***********\n");
	printf("\t***************************************************\n\n");

    // reset the total number of data element
    num_elements = 16;

    // reset grid and block structure
    block.x = 2;
	grid.x = (num_elements + block.x - 1) / block.x;

    // check grid and block info from host side
    printf("\t\t- CPU output -- grid.x=%d,  grid.y=%d,  grid.z=%d\n",  grid.x,  grid.y,  grid.z);
    printf("\t\t- CPU output -- block.x=%d, block.y=%d, block.z=%d\n", block.x, block.y, block.z);
	putchar('\n');

    hipLaunchKernelGGL(check_grid_block_thread_info_GPU, grid, block, 0, 0);

    CHECK(hipDeviceReset());

	printf("\n--Ending of the main function.\n\n");
    return 0;
}


void __global__ check_grid_block_thread_info_GPU(void)
{
    int gdx = gridDim.x;
    int gdy = gridDim.y;
    int gdz = gridDim.z;
    int bdx = blockDim.x;
    int bdy = blockDim.y;
    int bdz = blockDim.z;
    int bx  = blockIdx.x;
    int by  = blockIdx.y;
    int bz  = blockIdx.z;
    int tx  = threadIdx.x;
    int ty  = threadIdx.y;
    int tz  = threadIdx.z;
    printf("\t\t- GPU output -- gridDim=(%d, %d, %d)   blockDim=(%d, %d, %d)  blockIdx=(%d, %d, %d)  threadIdx=(%d, %d, %d)\n", 
		gdx, gdy, gdz,  bdx, bdy, bdz, bx, by, bz, tx, ty, tz);
}
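
The listings in this part include error_checker.h, which is not reproduced here. A minimal sketch (an assumption about its contents) of the kind of CHECK macro such a header usually provides:

    // error_checker.h -- hypothetical sketch of the CHECK macro used by the listings
    #pragma once
    #include <stdio.h>
    #include <stdlib.h>
    #include "hip/hip_runtime.h"

    #define CHECK(call)                                                    \
    do {                                                                   \
        const hipError_t error_code = (call);                              \
        if (error_code != hipSuccess)                                      \
        {                                                                  \
            printf("HIP Error:\n");                                        \
            printf("    File:  %s\n", __FILE__);                           \
            printf("    Line:  %d\n", __LINE__);                           \
            printf("    Error: %s\n", hipGetErrorString(error_code));      \
            exit(1);                                                       \
        }                                                                  \
    } while (0)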
Code Matrix Thread Index Info
#include "hip/hip_runtime.h"
//
// nvcc 02_matrix_thread_index_info.cu
//
#include <cstdio>
#include <hip/hip_runtime.h>
#include "error_checker.h"

void initialInt(int *matrix, int nxy);
void printMatrix(int *h_matrixA, const int nx, const int ny);
void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    int dev = 0;
    hipDeviceProp_t deviceProp;
    hipGetDeviceProperties(&deviceProp, dev);
    printf("\nUsing Device %d: %s\n", dev, deviceProp.name); // Using Device 0: NVIDIA GeForce GT 1030

    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
	int size_matrix = nxy*(sizeof(int));

    // malloc host mem
    int *h_matrixA;
    h_matrixA = (int *)malloc(size_matrix);
    initialInt(h_matrixA, nxy);
    printMatrix(h_matrixA, nx, ny);

    //malloc device mem
    int *d_matrixA;
    hipMalloc((void **)&d_matrixA, size_matrix);
    hipMemcpy(d_matrixA, h_matrixA, size_matrix, hipMemcpyHostToDevice); // copy data from CPU to GPU

    // set up execution configuration
    dim3 block(4, 2);
    dim3 grid((nx + block.x-1)/block.x, (ny + block.y-1)/block.y);
	printf("\ngrid info  >>> grid.x=%d   grid.y=%d   grid.z=%d.\n", grid.x, grid.y, grid.z);
	printf("block info >>> block.x=%d  block.y=%d  block.z=%d.\n\n", block.x, block.y, block.z);

    //invoke kernel
    hipLaunchKernelGGL(printGPUIdx, grid, block, 0, 0, d_matrixA, nx, ny);
    hipDeviceSynchronize();
	printf("\n");

    //free host and device
    free(h_matrixA);
    hipFree(d_matrixA);

    //reset device
    CHECK(hipDeviceReset());

	printf("\n--Ending of the main function.\n\n");

    return 0;
}


void initialInt(int *matrix, int nxy){
    for(int i = 0; i < nxy; i++)
        matrix[i] = i;
}


void printMatrix(int *h_matrixA, const int nx, const int ny){
    int *ic = h_matrixA;
    printf("\nMatrix:%d, %d\n", nx, ny);
    for(int iy = 0; iy < ny; iy++){
        for(int ix=0; ix < nx; ix++)
            printf("%3d ",ic[ix]);
        ic += nx;
        printf("\n");
    }
}


void __global__ printGPUIdx(int *d_matrixA, const int nx, const int ny){
    int ix = threadIdx.x + blockIdx.x*blockDim.x;
    int iy = threadIdx.y + blockIdx.y*blockDim.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = iy*nx + ix;
    printf("block_id (%d %d) thread_id (%d,%d) coordinate (%d %d) global_index (%d) value (%d)\n",bx, by, tx, ty, ix, iy, idx, d_matrixA[idx]);
}
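
The global index printed by printGPUIdx follows the usual row-major mapping idx = iy*nx + ix. A small standalone host-side check of that mapping for one thread of the launch above (a sketch, not part of the listing):

    #include <stdio.h>

    // Sketch: recompute the mapping for block (1,2), thread (3,1) of the
    // block(4,2) / grid(2,3) launch used above (nx = 8, ny = 6).
    int main(void)
    {
        const int nx = 8;
        const int blockDim_x = 4, blockDim_y = 2;   // block(4, 2)
        const int blockIdx_x = 1, blockIdx_y = 2;   // last block of grid(2, 3)
        const int threadIdx_x = 3, threadIdx_y = 1; // last thread of that block

        int ix  = threadIdx_x + blockIdx_x * blockDim_x; // 3 + 1*4 = 7
        int iy  = threadIdx_y + blockIdx_y * blockDim_y; // 1 + 2*2 = 5
        int idx = iy * nx + ix;                          // 5*8 + 7 = 47, the last element of the 8x6 matrix

        printf("coordinate (%d %d) global_index (%d)\n", ix, iy, idx);
        return 0;
    }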
Code Matrix Summation
#include "hip/hip_runtime.h"
// 
// nvcc 03_matrix_summation_GPU_2D2D_2D1D_1D1D.cu
// 
#include <hip/hip_runtime.h>
#include <stdio.h>
#include "error_checker.h"

const double EPSILON = 1.0E-8;

void matrix_initialization(float *ip, const int size);
void matrix_summation_on_CPU(float *A, float *B, float *C, const int, const int);
void check_results_from_CPU_GPU(float *fromCPU, float *fromGPU, const int);
void __global__ matrix_summation_on_GPU_1D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D1D(float *A, float *B, float *C, int, int);
void __global__ matrix_summation_on_GPU_2D2D(float *A, float *B, float *C, int, int);


int main(int argc, const char * argv[])
{

	printf("\n--Beginning of the main function.\n");

    // set up device
    int dev = 0;
    hipDeviceProp_t deviceProp;
    CHECK(hipGetDeviceProperties(&deviceProp, dev));
    printf("\n\tUsing Device %d: %s\n", dev, deviceProp.name);
    CHECK(hipSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 10;
    int ny = 1 << 10;

    int nxy = nx * ny;
    int size_matrix = nxy * sizeof(float);
    printf("\n\tMatrix size: nx=%d ny=%d\n", nx, ny);

    // malloc host memory
    float *h_matrixA, *h_matrixB, *h_matrixSumFromCPU, *h_matrixSumFromGPU;
    h_matrixA = (float *)malloc(size_matrix);
    h_matrixB = (float *)malloc(size_matrix);
    h_matrixSumFromCPU = (float *)malloc(size_matrix);
    h_matrixSumFromGPU = (float *)malloc(size_matrix);

    // initialize data at host side and define a timer
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    hipEventQuery(start);
    matrix_initialization(h_matrixA, nxy);
    matrix_initialization(h_matrixB, nxy);
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    float elapsed_time;
    hipEventElapsedTime(&elapsed_time, start, stop); //CHECK();
    printf("\tMatrix initialization on host(CPU) elapsed %f sec\n", elapsed_time);

    memset(h_matrixSumFromCPU, 0, size_matrix);
    memset(h_matrixSumFromGPU, 0, size_matrix);

    // summation of matrix elements at host(CPU) side
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    hipEventQuery(start);
    matrix_summation_on_CPU(h_matrixA, h_matrixB, h_matrixSumFromCPU, nx, ny);
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&elapsed_time, start, stop);
    printf("\tMatrix summation on host(CPU) elapsed %f sec\n", elapsed_time);


    // malloc device global memory
    float *d_matrixA, *d_matrixB, *d_matrixC;
    CHECK(hipMalloc((void **)&d_matrixA, size_matrix));
    CHECK(hipMalloc((void **)&d_matrixB, size_matrix));
    CHECK(hipMalloc((void **)&d_matrixC, size_matrix));

    // transfer data from host to device
    CHECK(hipMemcpy(d_matrixA, h_matrixA, size_matrix, hipMemcpyHostToDevice));
    CHECK(hipMemcpy(d_matrixB, h_matrixB, size_matrix, hipMemcpyHostToDevice));

//---------------
    // invoke kernel at host side for summation on GPU using 2D_grid and 2D_block
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy); // (32, 32, 1)
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); //(32, 32, 1)

    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    hipEventQuery(start);
    hipLaunchKernelGGL(matrix_summation_on_GPU_2D2D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (2D_grid 2D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    CHECK(hipGetLastError()); // check kernel error

    // copy kernel result back to host side
    hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

    // invoke kernel at host side for summation on GPU using 2D_grid and 1D_block
    dimy = 1;
	block.y = dimy; // block (32, 1, 1)
	grid.x = (nx + block.x - 1) / block.x;
	grid.y = ny; // grid (32, 1024, 1)

    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    hipEventQuery(start);
    hipLaunchKernelGGL(matrix_summation_on_GPU_2D1D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (2D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    CHECK(hipGetLastError()); // check kernel error

    // copy kernel result back to host side
    hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

    // invoke kernel at host side for summation on GPU using 1D_grid and 1D_block
    dimy = 1;
	block.y = dimy; // block (32, 1, 1)
	grid.x = (nx + block.x - 1) / block.x;
	grid.y = 1; // grid (32, 1, 1)

    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    hipEventQuery(start);
    hipLaunchKernelGGL(matrix_summation_on_GPU_1D1D, grid, block, 0, 0, d_matrixA, d_matrixB, d_matrixC, nx, ny);
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&elapsed_time, start, stop);
    printf("\n\tMatrix summation on GPU (1D_grid 1D_block) <<<(%d,%d), (%d,%d) >>> elapsed %f sec\n", 
           grid.x, grid.y, block.x, block.y, elapsed_time);
    CHECK(hipGetLastError()); // check kernel error

    // copy kernel result back to host side
    hipMemcpy(h_matrixSumFromGPU, d_matrixC, size_matrix, hipMemcpyDeviceToHost);

    // comparison of computation results
    check_results_from_CPU_GPU(h_matrixSumFromCPU, h_matrixSumFromGPU, nxy);
//---------------

	// destroy start and stop events
    CHECK(hipEventDestroy(start));
    CHECK(hipEventDestroy(stop));	

    // free host memory and device global memory
    free(h_matrixA);
    free(h_matrixB);
    free(h_matrixSumFromCPU);
    free(h_matrixSumFromGPU);
    hipFree(d_matrixA);
    hipFree(d_matrixB);
    hipFree(d_matrixC);

    CHECK(hipDeviceReset());

	printf("\n--Ending of the main function.\n\n");
    return 0;
}


void matrix_initialization(float *ip, const int size)
{
    for(int i = 0; i < size; i++)
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
}


void matrix_summation_on_CPU(float *matrixA, float *matrixB, float *matrixC,
	const int nx, const int ny)
{
    float *ia = matrixA;
    float *ib = matrixB;
    float *ic = matrixC;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
            ic[ix] = ia[ix] + ib[ix];
        ia += nx;
        ib += nx;
        ic += nx;
    }
}


void __global__ matrix_summation_on_GPU_2D2D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_2D1D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;
    if (ix < nx && iy < ny)
        matrixC[idx] = matrixA[idx] + matrixB[idx];
}
void __global__ matrix_summation_on_GPU_1D1D(float *matrixA, float *matrixB,
	float *matrixC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if (ix < nx)
        for (int iy = 0; iy < ny; iy++)
        {
            int idx = iy * nx + ix;
            matrixC[idx] = matrixA[idx] + matrixB[idx];
        }      
}


void check_results_from_CPU_GPU(float *h_matrixSumFromCPU,
	float *h_matrixSumFromGPU, const int N)
{
    bool has_error = false;
    for (int i = 0; i < N; i++)
    {
        if (fabs(h_matrixSumFromCPU[i] - h_matrixSumFromGPU[i]) > EPSILON)
        {
            has_error = true;
            printf("host %f gpu %f\n", h_matrixSumFromCPU[i], h_matrixSumFromGPU[i]);
            break;
        }
    }
    printf("\tChecking matrix summation results >>> %s\n", has_error? "|| ERROR ||":"|| NO ERROR ||");
}
Profiling Performance

ADD SOME RESULTS

…