ProgrammingThis forum is for all programming questions.
The question does not have to be directly related to Linux and any language is fair game.
Notices
Welcome to LinuxQuestions.org, a friendly and active Linux Community.
You are currently viewing LQ as a guest. By joining our community you will have the ability to post topics, receive our newsletter, use the advanced search, subscribe to threads and access many other special features. Registration is quick, simple and absolutely free. Join our community today!
Note that registered members see fewer ads, and ContentLink is completely disabled once you log in.
If you have any problems with the registration process or your account login, please contact us. If you need to reset your password, click here.
Having a problem logging in? Please visit this page to clear all LQ-related cookies.
Get a virtual cloud desktop with the Linux distro that you want in less than five minutes with Shells! With over 10 pre-installed distros to choose from, the worry-free installation life is here! Whether you are a digital nomad or just looking for flexibility, Shells can put your Linux machine on the device that you want to use.
Exclusive for LQ members, get up to 45% off per month. Click here for more info.
I got the program shown below from the link shown above.
Code:
//*******************************************************************
// Demo OpenCL application to compute a simple vector addition
// computation between 2 arrays on the GPU
// ******************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
// OpenCL source code
const char * OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
// Some interesting data for the vectors
int InitialData1[20]={37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17};
int InitialData2[20]={35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15};
// Number of elements in the vectors to be added
#define SIZE 2048
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
// Two integer source vectors in Host memory
int HostVector1[SIZE], HostVector2[SIZE];
// Initialize with some interesting repeating data
for ( int c = 0; c < SIZE; c++)
{
HostVector1[c] = InitialData1[c%20];
HostVector2[c] = InitialData2[c%20];
}
//Get an OpenCL platform
cl_platform_id cpPlatform;
clGetPlatformIDs (1, &cpPlatform, NULL);
// Get a GPU device
cl_device_id cdDevice;
clGetDeviceIDs (cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType (0, CL_DEVICE_TYPE_GPU, NULL, NULL,NULL);
// Create a command-queue on the GPU device
cl_command_queue cqCommandQueue = clCreateCommandQueue
(GPUContext, cdDevice, 0, NULL);
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);
cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);
// Allocate output memory on GPU
cl_mem GPUOutputVector = clCreateBuffer (GPUContext, CL_MEM_WRITE_ONLY,
sizeof (int) * SIZE, NULL, NULL);
// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource (GPUContext, 7,
OpenCLSource, NULL, NULL);
// Build the program (OpenCL JIT compilation)
clBuildProgram (OpenCLProgram, 0, NULL, NULL, NULL, NULL);
// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&GPUOutputVector);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem),(void*)&GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem),(void*)&GPUVector2);
// Launch the Kernel on the GPU
size_t WorkSize[1] = {SIZE};
clEnqueueNDRangeKernel (cqCommandQueue, OpenCLVectorAdd, 1, NULL,
WorkSize, NULL, 0, NULL, NULL);
// Copy the output in GPU memory back to CPU memory
int HostOutputVector[SIZE];
clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);
// Cleanup
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(cqCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);
// Print out the results
for (int Rows = 0; Rows < (SIZE/20); Rows++, printf("\t")){
for(int c = 0; c <20; c++){
printf("%c",(char)HostOutputVector[Rows * 20 + c]);
}
}
printf("\n\nThe End\n\n");
return 0;
}
Now when I run the program it gives the following output:
Code:
./vectoradd1
The End
[james@james Desktop]$ ls -al vectoradd1
The whole section of print statements directly above the "The End" print statements are ignored. What is going on?
I have copied the source code line by line once and then I simply copied and pasted it a second time. Twice I checked this code!
The results are the same in either case.
It is skipping statements directly below the "Print out the results" comment line.
I learned from using source code debugger that it is going through that section below the "Print out the results" comment line. It is stepping that part of the code, but it is not printing out anything.
Of course my source code and output files were not vectoradd.cpp and vectoradd. They each had different names.
I used my own name not these. I compiled with g++. I am not sure what you mean when say the printf statement
does not look right. Use my link to go to the source and you will see that is also the syntax that I used.
// Initialize with some interesting repeating data
for ( int c = 0; c < SIZE;
HostVector1[c] = InitialData1[c%20],
HostVector2[c] = InitialData2[c%20],
c++);
But you shouldn't. The convention (idiom, normal style) is to leave non-loop variable related things to the loop body. That's just what's done in C code normally, it isn't opencl specific. If you don't follow the convention, your code looks "strange", and is harder for others to read.
That is corrected, but it is still putting out the wrong answers
Okay, I think that I have corrected the opencl source as you suggested. Please check and criticize if it is not right.
The source is
Code:
//*******************************************************************
// Demo OpenCL application to compute a simple vector addition
// computation between 2 arrays on the GPU
// ******************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
// OpenCL source code
const char * OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
// Some interesting data for the vectors
int InitialData1[20]={37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17};
int InitialData2[20]={35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15};
// Number of elements in the vectors to be added
#define SIZE 2048
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
// Two integer source vectors in Host memory
int HostVector1[SIZE], HostVector2[SIZE];
// Initialize with some interesting repeating data
for ( int c = 0; c < SIZE; c++)
{
HostVector1[c] = InitialData1[c%20];
HostVector2[c] = InitialData2[c%20];
}
//Get an OpenCL platform
cl_platform_id cpPlatform;
clGetPlatformIDs (1, &cpPlatform, NULL);
// Get a GPU device
cl_device_id cdDevice;
clGetDeviceIDs (cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType (0, CL_DEVICE_TYPE_GPU, NULL, NULL,NULL);
// Create a command-queue on the GPU device
cl_command_queue cqCommandQueue = clCreateCommandQueue
(GPUContext, cdDevice, 0, NULL);
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);
cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);
// Allocate output memory on GPU
cl_mem GPUOutputVector = clCreateBuffer (GPUContext, CL_MEM_WRITE_ONLY,
sizeof (int) * SIZE, NULL, NULL);
// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource (GPUContext, 7,
OpenCLSource, NULL, NULL);
// Build the program (OpenCL JIT compilation)
clBuildProgram (OpenCLProgram, 0, NULL, NULL, NULL, NULL);
// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&GPUOutputVector);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem),(void*)&GPUVector1);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem),(void*)&GPUVector2);
// Launch the Kernel on the GPU
size_t WorkSize[1] = {SIZE};
clEnqueueNDRangeKernel (cqCommandQueue, OpenCLVectorAdd, 1, NULL,
WorkSize, NULL, 0, NULL, NULL);
// Copy the output in GPU memory back to CPU memory
int HostOutputVector[SIZE];
clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0,
SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);
// Cleanup
clReleaseKernel(OpenCLVectorAdd);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(cqCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(GPUVector1);
clReleaseMemObject(GPUVector2);
clReleaseMemObject(GPUOutputVector);
// Print out the results
for (int rows = 0; rows < (SIZE/20); rows++,
printf("\t")){
for(int c = 0; c <20; c++){
printf("%d",HostOutputVector[rows * 20 + c]);
}
}
printf("\n\nThe End\n\n");
return 0;
}
But it still is not outputing the correct answers. The output looks like this:
and I know that is not correct. All the vectros are not zero!
It nows prints them out, but does not print out the correct values.
What is wrong?
I have seen this specific source code in many places on the internet. None of those sites
have code that when you copy it and compile it and run it - it works.
LinuxQuestions.org is looking for people interested in writing
Editorials, Articles, Reviews, and more. If you'd like to contribute
content, let us know.