Old 10-31-2011, 02:09 PM   #1
getting wrong values in matrix multiplication

this is my program

#include <stdio>
#include <cuda>
#include <time>
#include <conio>
#define N 200
#define TILE_WIDTH 20

__global__ void MatMul(int*A, int* B, int* C) {

int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;

// Allocating memory in shared memory

__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

//copying the data to shared memory

for( i =0;i<N/TILE_WIDTH; i++)
temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];

// multiplying matrices in shared memory

for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];

// synchronizing the threads

C[uidy*N + uidx] = sum;

int main( void ) {

int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c

int *dev_a, *dev_b, *dev_c; //device copies of a,b,c

// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );

// fill the matrices 'a' and 'b' on the CPU

for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;

//copy above a,b values to device

cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;


//start record
cudaEventRecord(start, 0);

// Kernel invocation with N threads
dim3 dimGrid(10,10,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid>>> (dev_a, dev_b, dev_c);

//stop record
cudaEventRecord(stop, 0);

//this is operation time
cudaEventElapsedTime(&time, start, stop);

//clean up

//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );

for (int i=0; i < N; i++){
for (int j=0; j < N; j++){

printf( "%d ", c[i][j]);


//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf(" time elapsed in ms=%f\n",time);
return 0;

i am getting a matrix of value 2829400
i checked in matlab the value should be a matrix of value 2871200
Old 10-31-2011, 03:09 PM   #2
Is this correct
temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];
