PCL+CUDA編程（一）

本文轉載自查看原文 2016-11-30 14:29 6629 CUDA/ PCL

點雲的操作對運算資源的消耗是十分高的。但利用GPU並行運算的優點可以解決這個問題。下面我將跟大家分享關於利用CUDA處理PCL點雲數據的一些經驗。

首先舉一個簡單的例子說明CUDA程序是如何運作的。

我們先寫一個簡單的C++程序helloworld.cpp

 1 /*
 2  * helloworld.cpp
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #include <iostream>
 9 
10 #include <addition.h>
11 
12 
13 int main(int argc, char** argv)
14 {
15     int a=1,b=2,c;
16 
17     if(addition(a,b,&c))
18         std::cout<<"c="<<c<<std::endl;
19     else
20         std::cout<<"Addition failed!"<<std::endl;
21 
22     return 0;
23 }

我們將利用addition()函數將a和b相加，然後由c儲存它們的和。
addition()函數在頭文件聲明：

 1 /*
 2  * addition.h
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #ifndef INCLUDES_ADDITION_H_
 9 #define INCLUDES_ADDITION_H_
10 
11 /*check if the compiler is of C++*/
12 #ifdef __cplusplus
13 extern "C" bool addition(int a, int b, int *c);
14 
15 #endif
16 
17 
18 
19 #endif /* INCLUDES_ADDITION_H_ */

修飾符extern "C"是CUDA和C++混合編程時必須的。然後我們來看addition()在CUDA上的實現：

 1 #include <addition.h>
 2 __global__ void add(int *a, int *b, int *c)
 3 {
 4     *c=*a+*b;
 5 }
 6 
 7 extern "C" bool addition(int a, int b, int *c)
 8 {
 9     int *d_a, *d_b, *d_c;
10     int size=sizeof(int);
11     
12     cudaMalloc((void **)&d_a, size);
13     cudaMalloc((void **)&d_b, size);
14     cudaMalloc((void **)&d_c, size);
15     
16     cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
17     cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
18     
19     add<<<1,1>>>(d_a, d_b, d_c);
20     
21     cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
22     
23     cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
24     return true;
25 }

其中，帶有__global__修飾符的函數稱為“核函數”，它負責處理GPU內存裡的數據，是並行計算發生的地方。而bool addition(int a, int b, int *c)則負責CPU和GPU之間，也就是Host和Device之間的數據傳輸。

最後，編寫CMake文件編譯。

# Build script for the helloworld example: one C++ source plus one CUDA source.
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)

project(helloworld)

# REQUIRED stops configuration with an error if CUDA cannot be found.
find_package(CUDA REQUIRED)

# Directory holding the example's .h files, relative to this script.
include_directories(../../includes)

# Builds the executable from both the .cpp and the .cu source.
cuda_add_executable (helloworld helloworld.cpp addition.cu)

其中include_directories的參數為.h文件所在的目錄。

下面我們用相同的程序結構，寫一個最簡單的例子，用CUDA對PCL點雲中的一個點進行操作。

 1 /*
 2  * pcl_points_gpu.cpp
 3  *
 4  *  Created on: Nov 24, 2016
 5  *      Author: lzp
 6  */
 7 
 8 
 9 
10 #include <gpu_draw_cloud.h>
11 #include <pcl/io/pcd_io.h>
12 
13 int main(int argc, char** argv)
14 {
15     pcl::PointCloud<pcl::PointXYZRGB> cloud;
16     pcl::gpu::DeviceArray<pcl::PointXYZRGB> cloud_device;
17 
18 
19     cloud.width = 1;
20     cloud.height =1;
21     cloud.is_dense=false;
22     cloud.points.resize(cloud.width*cloud.height);
23 
24     std::vector<float> point_val;
25 
26     for(size_t i=0; i<3*cloud.points.size(); ++i)
27     {
28         point_val.push_back(1024*rand()/(RAND_MAX+1.0f));
29     }
30 
31     for (size_t i = 0; i < cloud.points.size(); ++i) {
32         cloud.points[i].x = point_val[3 * i];
33         cloud.points[i].y = point_val[3 * i + 1];
34         cloud.points[i].z = point_val[3 * i + 2];
35     }
36 
37     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;
38 
39     cloud_device.upload(cloud.points);
40 
41     cloud2GPU(cloud_device);
42 
43     cloud_device.download(cloud.points);
44 
45     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;
46     return (0);
47 }

這段代碼模仿了PCL中寫點雲的一個例子，生成了一個點，坐標是隨機生成的。關鍵點是pcl::gpu::DeviceArray<pcl::PointXYZRGB>，這是一個可以將點雲傳輸到GPU上的橋梁。它的upload() 和download()方法相當於前面例子中的cudaMemcpy()。詳情可參考PCL的源碼倉庫中/gpu/examples/和/gpu/octree/這兩個目錄的源碼。

接下來是頭文件：

 1 /*
 2  * gpu_draw_cloud.h
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #ifndef INCLUDES_GPU_DRAW_CLOUD_H_
 9 #define INCLUDES_GPU_DRAW_CLOUD_H_
10 
11 
12 #include <iostream>
13 #include <pcl/point_types.h>
14 #include <pcl/gpu/containers/device_array.h>
15 
16 /*check if the compiler is of C++*/
17 #ifdef __cplusplus
18 
19 
20 /*
21  * Try accessing GPU with pointcloud
22  * */
23 extern "C" bool cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device);
24 
25 
26 #endif
27 
28 
29 #endif /* INCLUDES_GPU_DRAW_CLOUD_H_ */

然后是函數實現體：

 1 #include <gpu_draw_cloud.h>
 2 
 3 
 4 
 5 
 6 __global__ void change_points(pcl::gpu::PtrSz<pcl::PointXYZRGB> cloud_device)
 7 {
 8     cloud_device[0].x+=1;
 9     pcl::PointXYZRGB q=cloud_device.data[0];
10     printf("x=%f, y=%f, z=%f, r=%d, g=%d, b=%d \n", q.x, q.y, q.z, q.r, q.g, q.b);
11 }
12 
13 
14 
15 extern "C" bool 
16 cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device)
17 {
18     change_points<<<1,1>>>(cloud_device);
19     return true;
20 }

在這個例子中，我將CPU和GPU的數據交互放到主函數中了，因此cloud2GPU函數只充當了一個調用核函數的接口。值得注意的是，在核函數的參數中，傳入的pcl::gpu::DeviceArray<pcl::PointXYZRGB>隱式轉換成pcl::gpu::PtrSz<pcl::PointXYZRGB>了。這兩個數據類型是實現C++和CUDA混合編程的關鍵。

最後附上CMakeLists。

 1 project(pcl_points_gpu)
 2 # Both PCL and CUDA are mandatory for this example.
 3 find_package(PCL 1.8 REQUIRED)
 4 find_package(CUDA REQUIRED)
 5 INCLUDE(FindCUDA)
 6 # Directory holding the example's own headers, relative to this script.
 7 include_directories(../../includes)
 8 # Header paths, library paths and compile definitions exported by PCL.
 9 include_directories(${PCL_INCLUDE_DIRS})
10 link_directories(${PCL_LIBRARY_DIRS})
11 add_definitions(${PCL_DEFINITIONS})
12 # Lines 13-24: strip the VTK-related definitions again; without this nvcc fails with "A single input file is required ..." (PCL issue 776).
13 get_directory_property(dir_defs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
14 set(vtk_flags)
15 # Collect every definition that matches the pattern into vtk_flags.
16 foreach(it ${dir_defs})
17     if(it MATCHES "vtk*")  # NOTE(review): MATCHES takes a regex, so this matches any entry containing "vt"; "^vtk" may have been intended - confirm
18     list(APPEND vtk_flags ${it})
19     endif()
20 endforeach()
21 # Remove each collected definition from the compile flags.
22 foreach(d ${vtk_flags})
23     remove_definitions(-D${d})
24 endforeach()
25 # Build from the .cpp and .cu sources, then link against PCL.
26 cuda_add_executable (pcl_points_gpu pcl_points_gpu.cpp gpu_draw_cloud.cu)
27 target_link_libraries (pcl_points_gpu ${PCL_LIBRARIES})

留意13-24行，如果沒有這幾行，nvcc編譯時會報出類似這樣的錯誤：

nvcc fatal : A single input file is required for a non-link phase when an outputfile is specified
CMake Error at pcl_points_gpu_generated_gpu_draw_cloud.cu.o.cmake:209 (message):
Error generating
XXXXXXXXXXXXXXXX./pcl_points_gpu_generated_gpu_draw_cloud.cu.o

根據https://github.com/PointCloudLibrary/pcl/issues/776的描述，這是VTK的一個bug所致，因此在CMake中添加了這幾行腳本。

希望這些例子對剛接觸PCL和CUDA的人有幫助。本人也是新手，對很多概念仍然模糊不清，望體諒。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 cuda編程（一） [CUDA] CUDA編程入門 PCL常見編程問題 CUDA編程 CUDA編程 [CUDA]CUDA編程資源匯總 CUDA編程入門 CUDA編程學習筆記1 詳解CUDA編程 liunx的arm64框架安裝cuda+pcl