Building on the previous post:
By now we know that, with the right kind of seemingly magical setup, the GPU really can compute faster than the CPU in certain situations. In other situations, however, the GPU turns out to be slower than the CPU, which makes it hard to state that conclusion without some lingering unease. Call it one or two small dark clouds.
So, is my memory playing tricks on me? Memory can be deceptive, so this post sets out to reproduce the experiment:
CMakeLists.txt:
cmake_minimum_required(VERSION 3.17)

project(vector_add CXX)

set(CUDA_ENABLE true)
if(CUDA_ENABLE)
    enable_language(CUDA)
endif()

set(MPI_ENABLE true)

set(PRJ_SRC_LIST)
set(PRJ_HEADER_LIST)
set(PRJ_LIBRARIES)
set(PRJ_INCLUDE_DIRS)

file(GLOB root_header_files "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
file(GLOB root_src_files "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
file(GLOB root_cuda_files "${CMAKE_CURRENT_SOURCE_DIR}/*.cu")

list(APPEND PRJ_HEADER_LIST ${root_header_files})
list(APPEND PRJ_SRC_LIST ${root_src_files})
list(APPEND PRJ_SRC_LIST ${root_cuda_files})

add_executable(${PROJECT_NAME}
    ${PRJ_SRC_LIST}
    ${PRJ_HEADER_LIST}
)

target_include_directories(${PROJECT_NAME}
    PRIVATE
        ${PRJ_INCLUDE_DIRS}
)

target_compile_features(${PROJECT_NAME}
    PUBLIC
        cuda_std_14
        cxx_std_14
)

set_target_properties(${PROJECT_NAME}
    PROPERTIES
        #CUDA_ARCHITECTURES "50;75"
        CUDA_ARCHITECTURES "35;50;52;72;75"
        CUDA_SEPARABLE_COMPILATION ON
)

target_link_libraries(${PROJECT_NAME}
    PRIVATE
        ${PRJ_LIBRARIES}
)
kernel.cu:
#include "cuda_runtime.h"
#include <vector>
#include <iostream>
#include <ctime>

using namespace std;

void addWithCuda(int* a, int* b, int* c, unsigned int nElements);
void addWithCPU(int* a, int* b, int* c, unsigned int nElements);
__global__ void addKernel(int* a, int* b, int* c);
void TestAddTime();
double CpuSecond();

double CpuSecond()
{
    clock_t now_time = clock();
    return static_cast<double>(now_time) / CLOCKS_PER_SEC;
}

void SetDevice(int devId)
{
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, devId);
    cout << "Using device " << devId << ": " << deviceProp.name << "\n";
    cudaSetDevice(devId);
}

int main()
{
    TestAddTime();
    return 0;
}

void TestAddTime()
{
    int arraySize = 4096 * 4096;
    vector<int> a(arraySize, 0);
    vector<int> b(arraySize, 1);
    vector<int> c(arraySize);

    double cpuStart = CpuSecond();
    addWithCPU(&a[0], &b[0], &c[0], arraySize);
    double cpuTime = CpuSecond() - cpuStart;
    cout << "CPU Execution Time: " << cpuTime << " sec\n";

    double gpuStart = CpuSecond();
    addWithCuda(&a[0], &b[0], &c[0], arraySize);
    double gpuTime = CpuSecond() - gpuStart;
    cout << "GPU Execution Time: " << gpuTime << " sec\n";

    cudaDeviceReset();
}

void addWithCuda(int* a, int* b, int* c, unsigned int nElements)
{
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaSetDevice(0);

    int nBytes = nElements * sizeof(int);

    cudaMalloc((void**)&dev_a, nBytes);
    cudaMalloc((void**)&dev_b, nBytes);
    cudaMalloc((void**)&dev_c, nBytes);

    cudaMemcpy(dev_a, a, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, nBytes, cudaMemcpyHostToDevice);

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, nElements>>>(dev_a, dev_b, dev_c);
    cudaDeviceSynchronize();

    cudaMemcpy(c, dev_c, nBytes, cudaMemcpyDeviceToHost);

    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
}

void addWithCPU(int* a, int* b, int* c, unsigned int nElements)
{
    for (int i = 0; i < nElements; ++i)
    {
        c[i] = a[i] + b[i];
    }
}

__global__ void addKernel(int* a, int* b, int* c)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
Compile and run:
Rebuild started…
1>------ Rebuild All started: Project: ZERO_CHECK, Configuration: Debug x64 ------
1>Checking Build System
2>------ Rebuild All started: Project: vector_add, Configuration: Debug x64 ------
2>Building Custom Rule D:/work/cuda_work/VectorAddSpeedTest/CMakeLists.txt
2>Compiling CUDA source file ..\kernel.cu...
2>
2>d:\work\cuda_work\VectorAddSpeedTest\build>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_50,code=\"compute_50,compute_50\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"compute_52,compute_52\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_72,code=\"compute_72,compute_72\" -gencode=arch=compute_72,code=\"sm_72,compute_72\" -gencode=arch=compute_75,code=\"compute_75,compute_75\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX64\x64" -x cu -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\include" --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -std=c++14 -Xcompiler="/EHsc -Zi -Ob0" -g -D_WINDOWS -D"CMAKE_INTDIR=\"Debug\"" -D_MBCS -D"CMAKE_INTDIR=\"Debug\"" -Xcompiler "/EHsc /W1 /nologo /Od /Fdvector_add.dir\Debug\vc142.pdb /FS /Zi /RTC1 /MDd /GR" -o vector_add.dir\Debug\kernel.obj "d:\work\cuda_work\VectorAddSpeedTest\kernel.cu"
2>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
2>kernel.cu
2>Done building project "vector_add.vcxproj".
2>
2>d:\work\cuda_work\VectorAddSpeedTest\build>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\nvcc.exe" -dlink -o vector_add.dir\Debug\vector_add.device-link.obj -Xcompiler "/EHsc /W1 /nologo /Od /Fdvector_add.dir\Debug\vc142.pdb /Zi /RTC1 /MDd /GR" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin/crt" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\lib\x64" cudadevrt.lib cudart_static.lib kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib -forward-unknown-to-host-compiler -Wno-deprecated-gpu-targets -gencode=arch=compute_35,code=compute_35 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=compute_50 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=compute_52 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_72,code=compute_72 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=compute_75 -gencode=arch=compute_75,code=sm_75 --machine 64 vector_add.dir\Debug\kernel.obj
2>cudadevrt.lib
2>cudart_static.lib
2>kernel32.lib
2>user32.lib
2>gdi32.lib
2>winspool.lib
2>shell32.lib
2>ole32.lib
2>oleaut32.lib
2>uuid.lib
2>comdlg32.lib
2>advapi32.lib
2>kernel.obj
2>   Creating library D:/work/cuda_work/VectorAddSpeedTest/build/Debug/vector_add.lib and object D:/work/cuda_work/VectorAddSpeedTest/build/Debug/vector_add.exp
2>vector_add.vcxproj -> D:\work\cuda_work\VectorAddSpeedTest\build\Debug\vector_add.exe
3>------ Skipped Rebuild All: Project: ALL_BUILD, Configuration: Debug x64 ------
3>Project not selected to build for this solution configuration
========== Rebuild All: 2 succeeded, 0 failed, 1 skipped ==========
The GPU came out more than 10 times slower than the CPU. I couldn't quite make sense of it, but I was thoroughly shocked.
Could Debug mode be the culprit? Switch to Release:
Change the problem size:
from
to:
This gives (Debug mode):
Release:
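The before and after values did not survive here. For reference, the problem size is simply the arraySize variable at the top of TestAddTime, so a change of scale is nothing more than, for example (these numbers are hypothetical, not necessarily the ones used in the original run):

// Hypothetical example: the problem size is controlled by arraySize in TestAddTime().
// int arraySize = 4096 * 4096;   // original size
int arraySize = 1024 * 1024;      // a smaller size for comparison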
At this scale the GPU is indeed somewhat faster, so this is clearly a size-dependent effect. Let's shrink the problem size a bit further:
This gives (Debug mode):
Release:
Now increase the size again:
This gives (Debug mode):
Release:
It should be pointed out that this way of timing may not be entirely accurate, but it serves as a useful reference.
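As a side note on timing accuracy: for the GPU part, a commonly used alternative to clock() is CUDA events, which time the work on the device itself. A minimal sketch, not part of the original article, assuming dev_a, dev_b and dev_c are already allocated and filled and that nElements fits in a single thread block (as the original <<<1, nElements>>> launch requires):

// Sketch: timing the kernel with CUDA events instead of clock().
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
addKernel<<<1, nElements>>>(dev_a, dev_b, dev_c);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
cout << "Kernel time: " << ms / 1000.0 << " sec\n";

cudaEventDestroy(start);
cudaEventDestroy(stop);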
Now let's go back to the case where the GPU was slower than the CPU and look for the cause.
Looking inside the addWithCuda function, we can see that it contains at least this device-related call:
cudaSetDevice(0);
Could an operation like this, tied to the device itself, be taking up time?
Let's first measure its cost:
Refactor the code as follows:
Running it gives:
Clearly, opening the device accounts for a large share of the time.
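The refactored listing and its output are not preserved here. As a reference only, here is a minimal sketch of one way to isolate the device-initialization cost, reusing the SetDevice helper already defined in kernel.cu and placed inside TestAddTime; the exact split is an assumption of mine, not necessarily what the original code did:

// Sketch: time device initialization separately from the actual GPU work.
double initStart = CpuSecond();
SetDevice(0);                     // the first CUDA runtime calls trigger driver/device initialization,
                                  // which is typically the expensive one-time cost
double initTime = CpuSecond() - initStart;
cout << "Device init time: " << initTime << " sec\n";

double gpuStart = CpuSecond();
addWithCuda(&a[0], &b[0], &c[0], arraySize);   // the remaining GPU work no longer pays the startup cost
double gpuTime = CpuSecond() - gpuStart;
cout << "GPU Execution Time: " << gpuTime << " sec\n";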
This suggests a direction for further refactoring: are there other parts that have nothing to do with the computation itself and are masking the question we actually care about?
Continuing:
This time the GPU time drops significantly again, which shows that housekeeping such as memory allocation is fairly expensive.
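The corresponding listing is again missing. One plausible form of this step is to allocate and free the device buffers outside the timed region, so that only the copies and the kernel launch are measured; a sketch under that assumption (the name addWithCudaPrealloc is mine, not from the original):

// Sketch: device memory is allocated once by the caller, outside the timed region.
// Assumes nElements fits in a single thread block, as in the original launch.
void addWithCudaPrealloc(int* dev_a, int* dev_b, int* dev_c,
                         int* a, int* b, int* c, unsigned int nElements)
{
    int nBytes = nElements * sizeof(int);
    cudaMemcpy(dev_a, a, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, nBytes, cudaMemcpyHostToDevice);
    addKernel<<<1, nElements>>>(dev_a, dev_b, dev_c);
    cudaDeviceSynchronize();
    cudaMemcpy(c, dev_c, nBytes, cudaMemcpyDeviceToHost);
}

The caller would issue the cudaMalloc calls before starting the timer and the cudaFree calls after stopping it.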
Continuing to refactor along the same lines:
This gives:
Continuing:
This shows that the copy operations themselves are very time-consuming.
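Pushing the same idea one step further, one can time nothing but the kernel launch, with the device initialized, the buffers allocated and the input already copied over. A sketch under those assumptions:

// Sketch: dev_a and dev_b already hold the input; measure only the kernel.
// Again assumes nElements fits in a single thread block, as in the original launch.
double kStart = CpuSecond();
addKernel<<<1, nElements>>>(dev_a, dev_b, dev_c);
cudaDeviceSynchronize();          // the launch is asynchronous, so wait before reading the clock
double kTime = CpuSecond() - kStart;
cout << "Kernel-only time: " << kTime << " sec\n";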
Continue the refactor so that the CPU and the GPU versions each loop 10 times:
At this point:
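That final listing is not preserved either; a sketch of the 10-iteration comparison under the same assumptions (device already initialized, device buffers already allocated and filled, problem size within a single thread block) might look like this:

// Sketch: amortize one-time costs by running each version 10 times.
const int nLoops = 10;

double cpuStart = CpuSecond();
for (int k = 0; k < nLoops; ++k)
{
    addWithCPU(&a[0], &b[0], &c[0], arraySize);
}
double cpuTime = CpuSecond() - cpuStart;
cout << "CPU x10: " << cpuTime << " sec\n";

double gpuStart = CpuSecond();
for (int k = 0; k < nLoops; ++k)
{
    addKernel<<<1, arraySize>>>(dev_a, dev_b, dev_c);
}
cudaDeviceSynchronize();          // wait for all launches before stopping the clock
double gpuTime = CpuSecond() - gpuStart;
cout << "GPU x10: " << gpuTime << " sec\n";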
Through the steps above, we have essentially broken down where the time goes in GPU vector addition, and the relationship between its cost and the CPU's now has a reasonably acceptable explanation. Interested readers can carry out a more detailed study along the same lines.