Getting wrong values in matrix multiplication
This is my program:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef _WIN32
#include <conio.h>   /* getch() is only available with Windows toolchains */
#endif
#include <cuda.h>
#include <cuda_runtime.h>

#define N 200
#define TILE_WIDTH 20

/* MatMul has no bounds checks, so the tiles must cover the matrix exactly. */
#if (N % TILE_WIDTH) != 0
#error "N must be a multiple of TILE_WIDTH"
#endif

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Tiled integer matrix multiply: C = A * B for row-major N x N matrices.
 *
 * Launch layout expected by the indexing below:
 *   block = (TILE_WIDTH, TILE_WIDTH, 1)
 *   grid  = (N / TILE_WIDTH, N / TILE_WIDTH, 1)
 * Each thread produces one element of C. The block uses two static
 * TILE_WIDTH x TILE_WIDTH int tiles in shared memory.
 *
 * A, B : device pointers to the N*N input matrices (read only).
 * C    : device pointer to the N*N output matrix.
 */
__global__ void MatMul(const int* A, const int* B, int* C)
{
    __shared__ int tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ int tileB[TILE_WIDTH][TILE_WIDTH];

    const int tx  = threadIdx.x;
    const int ty  = threadIdx.y;
    const int col = blockIdx.x * TILE_WIDTH + tx;   /* column of C owned by this thread */
    const int row = blockIdx.y * TILE_WIDTH + ty;   /* row of C owned by this thread */

    int sum = 0;

    /* Walk the tiles along the shared dimension of A and B. */
    for (int t = 0; t < N / TILE_WIDTH; ++t) {
        /* Each thread stages one element of the current A tile and B tile. */
        tileA[ty][tx] = A[row * N + (t * TILE_WIDTH + tx)];
        tileB[ty][tx] = B[(t * TILE_WIDTH + ty) * N + col];

        /* Both tiles must be completely written before any thread reads them. */
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k) {
            sum += tileA[ty][k] * tileB[k][tx];
        }

        /* Every thread must finish reading this tile pair before the next
         * iteration overwrites it; without this barrier the loop races. */
        __syncthreads();
    }

    C[row * N + col] = sum;
}

/*
 * Fills two N x N matrices on the host, multiplies them on the GPU with
 * MatMul, prints every element of the product and the elapsed kernel time.
 */
int main(void)
{
    /* Host copies of a, b, c. Static storage keeps ~480 KB off the stack. */
    static int a[N][N], b[N][N], c[N][N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;   /* device copies of a, b, c */
    const size_t bytes = (size_t)N * N * sizeof(int);

    /* Allocate the memory on the GPU. */
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_c, bytes));

    /* Fill the matrices 'a' and 'b' on the CPU.
     * With this fill every c[i][j] is the sum over k = 0..N-1 of (k+3)*(k+6),
     * which is 2829400 for N = 200. */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = j + 3;
            b[i][j] = i + 6;
        }
    }

    /* Copy the a, b values to the device. */
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    /* Prepare timer. */
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    /* Start record. */
    CUDA_CHECK(cudaEventRecord(start, 0));

    /* One thread per output element: a TILE_WIDTH x TILE_WIDTH block per tile. */
    dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    MatMul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaGetLastError());   /* reports an invalid launch configuration */

    /* Stop record; synchronizing on the event also waits for the kernel. */
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    /* This is the operation time. */
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));

    /* Clean up the timer. */
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    /* Copy result to host. */
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    /* Output. */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
    }

    /* Free the allocated memory on the device. */
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));

    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n", elapsedMs);

    /* Keep the console window open until a key is pressed. */
#ifdef _WIN32
    getch();
#else
    getchar();
#endif
    return 0;
}

/*
 * Poster's note: I am getting a matrix in which every value is 2829400. I
 * checked in MATLAB, and the value should be a matrix of value 2871200.
 *
 * NOTE(review): 2829400 is the sum over k = 0..199 of (k+3)*(k+6), i.e. the
 * result for the 0-based fill in main(). 2871200 is the same sum taken over
 * k = 1..200, so the MATLAB check presumably used 1-based indices when
 * building the inputs - confirm against the MATLAB script.
 */
Is this correct?
Quote:
|
All times are GMT -5. The time now is 07:34 AM. |