-
-
Save mcleary/2ecdf5edbe6d16426655ab1711a3039f to your computer and use it in GitHub Desktop.
#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
| #define NUM_GLOBAL_WITEMS 1024 | |
/**
 * Prints a short report comparing one GPU timing against the CPU timing.
 *
 * @param CPUtime  elapsed CPU time in seconds
 * @param GPUtime  elapsed GPU time in seconds
 * @param trial    version number shown in the report header
 *
 * When either time is not strictly positive the speed-up ratio is not
 * meaningful (it would print as inf or nan), so the report says so instead.
 */
void compareResults(double CPUtime, double GPUtime, int trial) {
    std::cout << "VERSION " << trial << " -----------" << '\n';
    std::cout << "CPU time: " << CPUtime << '\n';
    std::cout << "GPU time: " << GPUtime << '\n';

    // Written as !(x > 0) so that NaN inputs are rejected as well.
    if (!(CPUtime > 0.0) || !(GPUtime > 0.0)) {
        std::cout << "GPU speed-up is undefined (a measured time is zero)" << std::endl;
        return;
    }

    const double time_ratio = CPUtime / GPUtime;
    std::cout << "GPU is ";
    if (time_ratio > 1)
        std::cout << time_ratio << " times faster!" << std::endl;
    else
        std::cout << (1 / time_ratio) << " times slower :(" << std::endl;
}
/**
 * Adds two int vectors of size n element-wise, k times over, on the CPU.
 *
 * @param n  number of elements per vector
 * @param k  number of times the whole addition is repeated
 * @return   elapsed time of the k additions in seconds (vector set-up is not
 *           timed); 0.0 when n is not positive
 */
double timeAddVectorsCPU(int n, int k) {
    // A negative n would be converted to a huge unsigned size by the vector
    // constructors below, so reject non-positive sizes up front.
    if (n <= 0)
        return 0.0;

    std::vector<int> A(n);
    std::vector<int> B(n);
    std::vector<int> C(n, 0);
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = n - i;
    }

    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < n; j++)
            C[j] = A[j] + B[j];
    }
    const std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - start;

    // Read one result through a volatile so the stores above are observable
    // and the timed loop is not a pure dead store (best effort only).
    volatile int sink = C[n - 1];
    (void)sink;

    return elapsed.count();
}
| void warmup(cl::Context& context, cl::CommandQueue& queue, | |
| cl::Kernel& add, int A[], int B[], int n) { | |
| std::vector<int> C(n); | |
| // allocate space | |
| cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| // push write commands to queue | |
| queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A); | |
| queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B); | |
| // RUN ZE KERNEL | |
| add.setArg(1, buffer_B); | |
| add.setArg(0, buffer_A); | |
| add.setArg(2, buffer_C); | |
| for (int i = 0; i < 5; i++) | |
| queue.enqueueNDRangeKernel(add, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32)); | |
| queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data()); | |
| queue.finish(); | |
| } | |
| int main(int argc, char* argv[]) | |
| { | |
| bool verbose; | |
| if (argc == 1 || std::strcmp(argv[1], "0") == 0) | |
| verbose = true; | |
| else | |
| verbose = false; | |
| verbose = 1; | |
| const int n = 8 * 32 * 512; // size of vectors | |
| const int k = 1000; // number of loop iterations | |
| // const int NUM_GLOBAL_WITEMS = 1024; // number of threads | |
| // get all platforms (drivers), e.g. NVIDIA | |
| std::vector<cl::Platform> all_platforms; | |
| cl::Platform::get(&all_platforms); | |
| if (all_platforms.size() == 0) { | |
| std::cout << " No platforms found. Check OpenCL installation!\n"; | |
| exit(1); | |
| } | |
| cl::Platform default_platform = all_platforms[1]; | |
| std::cout << "Using platform: "<<default_platform.getInfo<CL_PLATFORM_NAME>()<<"\n"; | |
| // get default device (CPUs, GPUs) of the default platform | |
| std::vector<cl::Device> all_devices; | |
| default_platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices); | |
| if (all_devices.size() == 0) { | |
| std::cout << " No devices found. Check OpenCL installation!\n"; | |
| exit(1); | |
| } | |
| cl::Device default_device = all_devices[0]; | |
| std::cout<< "Using device: "<<default_device.getInfo<CL_DEVICE_NAME>()<<"\n"; | |
| cl::Context context({ default_device }); | |
| cl::Program::Sources sources; | |
| // calculates for each element; C = A + B | |
| std::string kernel_code = | |
| " void kernel add(global const int* v1, global const int* v2, global int* v3) {" | |
| " int ID;" | |
| " ID = get_global_id(0);" | |
| " v3[ID] = v1[ID] + v2[ID];" | |
| " }" | |
| "" | |
| " void kernel add_looped_1(global const int* v1, global const int* v2, global int* v3, " | |
| " const int n, const int k) {" | |
| " int ID, NUM_GLOBAL_WITEMS, ratio, start, stop;" | |
| " ID = get_global_id(0);" | |
| " NUM_GLOBAL_WITEMS = get_global_size(0);" | |
| "" | |
| " ratio = (n / NUM_GLOBAL_WITEMS);" // elements per thread | |
| " start = ratio * ID;" | |
| " stop = ratio * (ID+1);" | |
| "" | |
| " int i, j;" // will the compiler optimize this anyway? probably. | |
| " for (i=0; i<k; i++) {" | |
| " for (j=start; j<stop; j++)" | |
| " v3[j] = v1[j] + v2[j];" | |
| " }" | |
| " }" | |
| "" | |
| " void kernel add_looped_2(global const int* v1, global const int* v2, global int* v3," | |
| " const int n, const int k) {" | |
| " int ID, NUM_GLOBAL_WITEMS, step;" | |
| " ID = get_global_id(0);" | |
| " NUM_GLOBAL_WITEMS = get_global_size(0);" | |
| " step = (n / NUM_GLOBAL_WITEMS);" | |
| "" | |
| " int i,j;" | |
| " for (i=0; i<k; i++) {" | |
| " for (j=ID; j<n; j+=step)" | |
| " v3[j] = v1[j] + v2[j];" | |
| " }" | |
| " }" | |
| "" | |
| " void kernel add_single(global const int* v1, global const int* v2, global int* v3, " | |
| " const int k) { " | |
| " int ID = get_global_id(0);" | |
| " for (int i=0; i<k; i++)" | |
| " v3[ID] = v1[ID] + v2[ID];" | |
| " }"; | |
| sources.push_back({ kernel_code.c_str(), kernel_code.length() }); | |
| cl::Program program(context, sources); | |
| if (program.build({ default_device }) != CL_SUCCESS) { | |
| std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl; | |
| exit(1); | |
| } | |
| // run the CPU code | |
| float CPUtime = timeAddVectorsCPU(n, k); | |
| // set up kernels and vectors for GPU code | |
| cl::CommandQueue queue(context, default_device); | |
| cl::Kernel add = cl::Kernel(program, "add"); | |
| cl::Kernel add_looped_1 = cl::Kernel(program, "add_looped_1"); | |
| cl::Kernel add_looped_2 = cl::Kernel(program, "add_looped_2"); | |
| cl::Kernel add_single = cl::Kernel(program, "add_single"); | |
| // construct vectors | |
| std::vector<int> A(n); | |
| std::vector<int> B(n); | |
| std::vector<int> C(n); | |
| for (int i = 0; i < n; i++) { | |
| A[i] = i; | |
| B[i] = n - i - 1; | |
| } | |
| // attempt at warm-up... | |
| warmup(context, queue, add, A.data(), B.data(), n); | |
| queue.finish(); | |
| std::clock_t start_time; | |
| // VERSION 1 ========================================== | |
| // start timer | |
| double GPUtime1; | |
| start_time = std::clock(); | |
| // allocate space | |
| cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| // push write commands to queue | |
| queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A.data()); | |
| queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B.data()); | |
| // RUN ZE KERNEL | |
| add_looped_1.setArg(0, buffer_A); | |
| add_looped_1.setArg(1, buffer_B); | |
| add_looped_1.setArg(2, buffer_C); | |
| add_looped_1.setArg(3, n); | |
| add_looped_1.setArg(4, k); | |
| queue.enqueueNDRangeKernel(add_looped_1, cl::NullRange, // kernel, offset | |
| cl::NDRange(NUM_GLOBAL_WITEMS), // global number of work items | |
| cl::NDRange(32)); // local number (per group) | |
| // read result from GPU to here; including for the sake of timing | |
| queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C.data()); | |
| queue.finish(); | |
| GPUtime1 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC; | |
| // VERSION 2 ========================================== | |
| double GPUtime2; | |
| cl::Buffer buffer_A2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_B2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| cl::Buffer buffer_C2(context, CL_MEM_READ_WRITE, sizeof(int) * n); | |
| queue.enqueueWriteBuffer(buffer_A2, CL_TRUE, 0, sizeof(int) * n, A.data()); | |
| queue.enqueueWriteBuffer(buffer_B2, CL_TRUE, 0, sizeof(int) * n, B.data()); | |
| start_time = std::clock(); | |
| add_looped_2.setArg(0, buffer_A2); | |
| add_looped_2.setArg(1, buffer_B2); | |
| add_looped_2.setArg(2, buffer_C2); | |
| add_looped_2.setArg(3, n); | |
| add_looped_2.setArg(4, k); | |
| queue.enqueueNDRangeKernel(add_looped_2, cl::NullRange, cl::NDRange(NUM_GLOBAL_WITEMS), cl::NDRange(32)); | |
| queue.enqueueReadBuffer(buffer_C2, CL_TRUE, 0, sizeof(int) * n, C.data()); | |
| queue.finish(); | |
| GPUtime2 = (std::clock() - start_time) / (double)CLOCKS_PER_SEC; | |
| // let's compare! | |
| const int NUM_VERSIONS = 2; | |
| double GPUtimes[NUM_VERSIONS] = { GPUtime1, GPUtime2 }; | |
| if (verbose) { | |
| for (int i = 0; i < NUM_VERSIONS; i++) | |
| compareResults(CPUtime, GPUtimes[i], i + 1); | |
| } | |
| else { | |
| std::cout << CPUtime << ","; | |
| for (int i = 0; i < NUM_VERSIONS - 1; i++) | |
| std::cout << GPUtimes[i] << ","; | |
| std::cout << GPUtimes[NUM_VERSIONS - 1] << std::endl; | |
| } | |
| return 0; | |
| } |
I started to read your code because of your Timer class and you don't even use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d
That is true, however, this OpenCL sample was meant to be used as single file to test OpenCL compilation and stuff like that. I did the Timer class as a gist for quick reference, but I do use a similar version in my Atmosphere demo here: https://github.com/mcleary/pbr/blob/master/pbr/main.cpp
Another thing, I wrote that Timer class 4 years ago. There was a lot I didn't know at the time on how to use std::chrono
Don't worry, all feedback is welcome.
- When I opened some classes, I noticed that the style was not the same
everywhere,
I know that, and I also use clang-format extensively, but I didn't bother to use it in my personal project.
`virtual` and `override` are redundant; only the second one really
matters. If it overrides, it must be a virtual function
I know that as well but I probably didn't at the time I wrote the code for the first time
I started to read your code because of your Timer class and you don't even use it here ;-)
https://gist.github.com/mcleary/b0bf4fa88830ff7c882d