
CUDA from Beginner to Mastery (12): Building GPU Vector Addition with CMake and Comparing CPU vs GPU Speed

2022-08-04

Building on the previous posts in this series:

By now we know that, with the right setup, the GPU can indeed compute faster than the CPU in some situations. In other situations, however, the GPU is actually slower than the CPU, which makes that conclusion feel somewhat unsatisfying to state.

You could say these are one or two small dark clouds hanging over the result.

So did I simply misremember? Memory can be deceptive, so this post sets out to reproduce the experiment:

CMakeLists.txt:

cmake_minimum_required(VERSION 3.17)
project(vector_add CXX)

set(CUDA_ENABLE true)
if(CUDA_ENABLE)
    enable_language(CUDA)
endif()
set(MPI_ENABLE true)

set(PRJ_SRC_LIST)
set(PRJ_HEADER_LIST)
set(PRJ_LIBRARIES)
set(PRJ_INCLUDE_DIRS)

file(GLOB root_header_files "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
file(GLOB root_src_files "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
file(GLOB root_cuda_files "${CMAKE_CURRENT_SOURCE_DIR}/*.cu")

list(APPEND PRJ_HEADER_LIST ${root_header_files})
list(APPEND PRJ_SRC_LIST ${root_src_files})
list(APPEND PRJ_SRC_LIST ${root_cuda_files})

add_executable(${PROJECT_NAME} ${PRJ_SRC_LIST} ${PRJ_HEADER_LIST})

target_include_directories(${PROJECT_NAME} PRIVATE ${PRJ_INCLUDE_DIRS})

target_compile_features(${PROJECT_NAME} PUBLIC cuda_std_14 cxx_std_14)

set_target_properties(${PROJECT_NAME} PROPERTIES
    #CUDA_ARCHITECTURES "50;75"
    CUDA_ARCHITECTURES "35;50;52;72;75"
    CUDA_SEPARABLE_COMPILATION ON
)

target_link_libraries(${PROJECT_NAME} PRIVATE ${PRJ_LIBRARIES})

kernel.cu:

#include "cuda_runtime.h"
#include <vector>
#include <iostream>
#include <ctime>
using namespace std;

void addWithCuda(int* a, int* b, int* c, unsigned int nElements);
void addWithCPU(int* a, int* b, int* c, unsigned int nElements);
__global__ void addKernel(int* a, int* b, int* c, unsigned int nElements);
void TestAddTime();
double CpuSecond();

// Wall-clock seconds based on clock(); coarse, but enough for a first comparison.
double CpuSecond()
{
    clock_t now_time = clock();
    return static_cast<double>(now_time) / CLOCKS_PER_SEC;
}

void SetDevice(int devId)
{
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, devId);
    cout << "Using device " << devId << ": " << deviceProp.name << "\n";
    cudaSetDevice(devId);
}

int main()
{
    TestAddTime();
    return 0;
}

void TestAddTime()
{
    int arraySize = 4096 * 4096;
    vector<int> a(arraySize, 0);
    vector<int> b(arraySize, 1);
    vector<int> c(arraySize);

    double cpuStart = CpuSecond();
    addWithCPU(&a[0], &b[0], &c[0], arraySize);
    double cpuTime = CpuSecond() - cpuStart;
    cout << "CPU Execution Time: " << cpuTime << " sec\n";

    double gpuStart = CpuSecond();
    addWithCuda(&a[0], &b[0], &c[0], arraySize);
    double gpuTime = CpuSecond() - gpuStart;
    cout << "GPU Execution Time: " << gpuTime << " sec\n";

    cudaDeviceReset();
}

void addWithCuda(int* a, int* b, int* c, unsigned int nElements)
{
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaSetDevice(0);

    size_t nBytes = nElements * sizeof(int);
    cudaMalloc((void**)&dev_a, nBytes);
    cudaMalloc((void**)&dev_b, nBytes);
    cudaMalloc((void**)&dev_c, nBytes);

    cudaMemcpy(dev_a, a, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, nBytes, cudaMemcpyHostToDevice);

    // Launch one thread per element. A single block is limited to 1024 threads,
    // so the elements are spread over enough blocks to cover the whole array.
    int threadsPerBlock = 256;
    int blocksPerGrid = (nElements + threadsPerBlock - 1) / threadsPerBlock;
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, nElements);
    cudaDeviceSynchronize();

    cudaMemcpy(c, dev_c, nBytes, cudaMemcpyDeviceToHost);

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
}

void addWithCPU(int* a, int* b, int* c, unsigned int nElements)
{
    for (unsigned int i = 0; i < nElements; ++i)
    {
        c[i] = a[i] + b[i];
    }
}

__global__ void addKernel(int* a, int* b, int* c, unsigned int nElements)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nElements)
    {
        c[i] = a[i] + b[i];
    }
}
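One thing to note about the listing above: none of the CUDA runtime calls are checked for errors, so a failed allocation or a bad kernel launch would go unnoticed and quietly distort the timings. A minimal sketch of the usual checking pattern (the CHECK_CUDA macro name is mine, not from the original code):

#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>

// Wrap every CUDA runtime call; print the error string and abort on failure.
#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Usage inside addWithCuda, for example:
//   CHECK_CUDA(cudaMalloc((void**)&dev_a, nBytes));
//   addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, nElements);
//   CHECK_CUDA(cudaGetLastError());        // catches launch-configuration errors
//   CHECK_CUDA(cudaDeviceSynchronize());   // catches errors raised during execution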

Build and run:

已启动重新生成…
1>------ 已启动全部重新生成: 项目: ZERO_CHECK, 配置: Debug x64 ------
1>Checking Build System
2>------ 已启动全部重新生成: 项目: vector_add, 配置: Debug x64 ------
2>Building Custom Rule D:/work/cuda_work/VectorAddSpeedTest/CMakeLists.txt
2>Compiling CUDA source file ..\kernel.cu...
2>
2>d:\work\cuda_work\VectorAddSpeedTest\build>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_50,code=\"compute_50,compute_50\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"compute_52,compute_52\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_72,code=\"compute_72,compute_72\" -gencode=arch=compute_72,code=\"sm_72,compute_72\" -gencode=arch=compute_75,code=\"compute_75,compute_75\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX64\x64" -x cu -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\include" --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -std=c++14 -Xcompiler="/EHsc -Zi -Ob0" -g -D_WINDOWS -D"CMAKE_INTDIR=\"Debug\"" -D_MBCS -D"CMAKE_INTDIR=\"Debug\"" -Xcompiler "/EHsc /W1 /nologo /Od /Fdvector_add.dir\Debug\vc142.pdb /FS /Zi /RTC1 /MDd /GR" -o vector_add.dir\Debug\kernel.obj "d:\work\cuda_work\VectorAddSpeedTest\kernel.cu"
2>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
2>kernel.cu
2>已完成生成项目「vector_add.vcxproj」的操作。
2>
2>d:\work\cuda_work\VectorAddSpeedTest\build>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\nvcc.exe" -dlink -o vector_add.dir\Debug\vector_add.device-link.obj -Xcompiler "/EHsc /W1 /nologo /Od /Fdvector_add.dir\Debug\vc142.pdb /Zi /RTC1 /MDd /GR" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin/crt" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\lib\x64" cudadevrt.lib cudart_static.lib kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib -forward-unknown-to-host-compiler -Wno-deprecated-gpu-targets -gencode=arch=compute_35,code=compute_35 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=compute_50 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_72,code=compute_72 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=compute_75 -gencode=arch=compute_75,code=sm_75 --machine 64 vector_add.dir\Debug\kernel.obj
2>cudadevrt.lib
2>cudart_static.lib
2>kernel32.lib
2>user32.lib
2>gdi32.lib
2>winspool.lib
2>shell32.lib
2>ole32.lib
2>oleaut32.lib
2>uuid.lib
2>comdlg32.lib
2>advapi32.lib
2>kernel.obj
2>  正在创建库 D:/work/cuda_work/VectorAddSpeedTest/build/Debug/vector_add.lib 和对象 D:/work/cuda_work/VectorAddSpeedTest/build/Debug/vector_add.exp
2>vector_add.vcxproj -> D:\work\cuda_work\VectorAddSpeedTest\build\Debug\vector_add.exe
3>------ 已跳过全部重新生成: 项目: ALL_BUILD, 配置: Debug x64 ------
3>没有为此解决方案配置选中要生成的项目
========== 全部重新生成: 成功 2 个,失败 0 个,跳过 1 个 ==========

The GPU turns out to be more than ten times slower than the CPU. I don't quite understand it, but I am deeply shocked.

Could it be that the Debug build is unreliable? Switch to Release:

Now change the problem size:

Change it to:
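The modified line is shown in the original only as a screenshot, so the exact value is not preserved here. The change is simply the arraySize initialization in TestAddTime; the numbers below are illustrative placeholders:

// In TestAddTime(): only the element count changes between experiments.
int arraySize = 1024 * 1024;     // smaller problem (illustrative value)
// int arraySize = 4096 * 4096;  // the original size
// int arraySize = 8192 * 8192;  // a larger problem (illustrative value)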

This gives (Debug mode):

Release:

At this size the GPU is indeed somewhat faster, so this is clearly a size-dependent issue. Let's shrink the problem size further:

This gives (Debug mode):

Release:

Now increase the size again:

This gives (Debug mode):

Release:

It should be pointed out that this timing may not be entirely accurate, but it provides a useful reference.
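Since clock() on Windows has fairly coarse resolution, two more precise alternatives are worth sketching here: std::chrono::steady_clock for host-side wall time, and CUDA events for the device-side portion alone. The helper names WallSecond and TimeKernelMs are mine, and the sketch assumes the addKernel signature from the listing above:

#include "cuda_runtime.h"
#include <chrono>

// Host-side wall-clock seconds with sub-millisecond resolution.
double WallSecond()
{
    using steady = std::chrono::steady_clock;
    static const steady::time_point start = steady::now();
    return std::chrono::duration<double>(steady::now() - start).count();
}

// Device-side timing of a single kernel launch using CUDA events.
float TimeKernelMs(int* dev_a, int* dev_b, int* dev_c, unsigned int nElements)
{
    cudaEvent_t startEvt, stopEvt;
    cudaEventCreate(&startEvt);
    cudaEventCreate(&stopEvt);

    int threadsPerBlock = 256;
    int blocksPerGrid = (nElements + threadsPerBlock - 1) / threadsPerBlock;

    cudaEventRecord(startEvt);
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, nElements);
    cudaEventRecord(stopEvt);
    cudaEventSynchronize(stopEvt);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, startEvt, stopEvt);
    cudaEventDestroy(startEvt);
    cudaEventDestroy(stopEvt);
    return ms;
}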

Next, let's go back to the earlier case where the GPU was slower than the CPU and see what the cause of the problem is.

Looking at the code, we can see that the addWithCuda function contains at least the following:
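The snippet referred to here appears as an image in the original; from the kernel.cu listing above, the device-related call inside addWithCuda is:

// Choose which GPU to run on, change this on a multi-GPU system.
cudaSetDevice(0);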

Could a device-related operation like this be taking up time?

Let's measure its time first.

Refactor the code as follows:
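The refactored code at this step is shown only as a screenshot in the original. A minimal sketch of the idea, assuming the intent is to time the first device-touching call separately from the rest of the GPU path (reusing CpuSecond, addWithCPU, and addWithCuda from the listing above):

void TestAddTime()
{
    int arraySize = 4096 * 4096;
    vector<int> a(arraySize, 0);
    vector<int> b(arraySize, 1);
    vector<int> c(arraySize);

    // Time the device selection on its own. The first CUDA runtime call in a
    // process also creates the CUDA context, which is comparatively expensive.
    double initStart = CpuSecond();
    cudaSetDevice(0);
    cout << "GPU init Time: " << CpuSecond() - initStart << " sec\n";

    double cpuStart = CpuSecond();
    addWithCPU(&a[0], &b[0], &c[0], arraySize);
    cout << "CPU Execution Time: " << CpuSecond() - cpuStart << " sec\n";

    double gpuStart = CpuSecond();
    addWithCuda(&a[0], &b[0], &c[0], arraySize);
    cout << "GPU Execution Time: " << CpuSecond() - gpuStart << " sec\n";

    cudaDeviceReset();
}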

Running it gives:

Clearly, opening the device accounts for a large share of the time.

This suggests a direction for refactoring: are there other parts unrelated to the computation itself that are masking the question we actually care about?

Continuing:
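Again, the code for this step exists only as a screenshot. A sketch of the likely shape, assuming the device buffers are now allocated and freed outside the timed region so that only the copies and the kernel remain inside it (addWithCudaPrealloc is an illustrative name, not from the original):

// Allocation and free moved out of the timed region; the timed part now
// contains only the host-to-device copies, the kernel, and the copy back.
void addWithCudaPrealloc(int* a, int* b, int* c, unsigned int nElements,
                         int* dev_a, int* dev_b, int* dev_c)
{
    size_t nBytes = nElements * sizeof(int);
    cudaMemcpy(dev_a, a, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, nBytes, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (nElements + threadsPerBlock - 1) / threadsPerBlock;
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, nElements);
    cudaDeviceSynchronize();

    cudaMemcpy(c, dev_c, nBytes, cudaMemcpyDeviceToHost);
}

// In TestAddTime(), before starting the GPU timer:
//   cudaSetDevice(0);
//   cudaMalloc((void**)&dev_a, nBytes);
//   cudaMalloc((void**)&dev_b, nBytes);
//   cudaMalloc((void**)&dev_c, nBytes);
// ...and after stopping it:
//   cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);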

This time the GPU timing drops significantly again, which shows that housekeeping such as memory allocation is fairly expensive.

Continuing to refactor along this line:

This gives:

Continuing:
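The code for this step is likewise only a screenshot. A sketch that splits the remaining GPU work into individually timed phases, so that the cost of the memory copies can be compared with the kernel itself (addWithCudaPhased is an illustrative name):

// Break the remaining GPU work into individually timed phases so that the
// memcpy traffic can be compared with the kernel itself.
void addWithCudaPhased(int* a, int* b, int* c, unsigned int nElements,
                       int* dev_a, int* dev_b, int* dev_c)
{
    size_t nBytes = nElements * sizeof(int);
    int threadsPerBlock = 256;
    int blocksPerGrid = (nElements + threadsPerBlock - 1) / threadsPerBlock;

    double t0 = CpuSecond();
    cudaMemcpy(dev_a, a, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, nBytes, cudaMemcpyHostToDevice);
    double t1 = CpuSecond();

    addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, nElements);
    cudaDeviceSynchronize();
    double t2 = CpuSecond();

    cudaMemcpy(c, dev_c, nBytes, cudaMemcpyDeviceToHost);
    double t3 = CpuSecond();

    cout << "H2D copy: " << t1 - t0 << " sec, kernel: " << t2 - t1
         << " sec, D2H copy: " << t3 - t2 << " sec\n";
}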

This shows that the copy operations are very time-consuming.

Refactoring further so that the CPU and GPU each run the addition in a loop 10 times gives:
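The looped version is also shown only as a screenshot. A sketch of the idea, reusing the host vectors, device buffers, and launch configuration set up in the sketches above, so that the one-off costs are amortized over the repetitions:

// Run the same addition 10 times on each side so that one-off costs
// (context creation, allocation, copies) are amortized across iterations.
const int nRepeat = 10;

double cpuStart = CpuSecond();
for (int iter = 0; iter < nRepeat; ++iter)
{
    addWithCPU(&a[0], &b[0], &c[0], arraySize);
}
cout << "CPU x" << nRepeat << ": " << CpuSecond() - cpuStart << " sec\n";

double gpuStart = CpuSecond();
for (int iter = 0; iter < nRepeat; ++iter)
{
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, arraySize);
}
cudaDeviceSynchronize();
cout << "GPU x" << nRepeat << ": " << CpuSecond() - gpuStart << " sec\n";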

At this point:

Through the steps above, we have essentially examined where the time goes in GPU vector addition, and arrived at a reasonably acceptable explanation of how its cost relates to the CPU's. Interested readers can follow this line of thought into a more detailed investigation.
