LinuxQuestions.org - getting wrong values in matrix multiplication

this is my program

#include <stdio>
#include <cuda>
#include <time>
#include <conio>
#define N 200
#define TILE_WIDTH 20

__global__ void MatMul(int*A, int* B, int* C) {

int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;

// Allocating memory in shared memory

__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

//copying the data to shared memory

for( i =0;i<N/TILE_WIDTH; i++)
{
temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];
__syncthreads();

// multiplying matrices in shared memory

for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];
}
}

// synchronizing the threads

__syncthreads();
C[uidy*N + uidx] = sum;
}

int main( void ) {

int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c

int *dev_a, *dev_b, *dev_c; //device copies of a,b,c

// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );

// fill the matrices 'a' and 'b' on the CPU

for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;
}
}

//copy above a,b values to device

cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;

cudaEventCreate(&start);
cudaEventCreate(&stop);

//start record
cudaEventRecord(start, 0);

// Kernel invocation with N threads
dim3 dimGrid(10,10,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid>>> (dev_a, dev_b, dev_c);

//stop record
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

//this is operation time
cudaEventElapsedTime(&time, start, stop);

//clean up
cudaEventDestroy(start);
cudaEventDestroy(stop);

//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );

//output..
for (int i=0; i < N; i++){
for (int j=0; j < N; j++){

printf( "%d ", c[i][j]);

}
}

//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf("\n");
printf(" time elapsed in ms=%f\n",time);
getch();
return 0;
}

i am getting a matrix of value 2829400
i checked in matlab the value should be a matrix of value 2871200