0. 简介

上一章我们主要讲了噪声滤除这种比较高级的点云处理算法,下面这一章我们将来看一下最近邻搜索。最近邻搜索(Nearest Neighbor Search)是一种广泛应用于机器学习、计算机视觉和自然语言处理等领域的算法。它的主要目的是找到一个给定数据集中与查询数据最相似的数据点。

1. CUDA与Thrust

在传统的CPU计算中,最近邻搜索通常需要对每个数据点与查询点进行一次比较。这样的计算复杂度在数据集较大时会变得非常高,导致查询时间非常慢。而使用CUDA和Thrust方法可以加速这个过程,大大提高最近邻搜索的效率。

CUDA是一种并行计算架构,它可以利用GPU的强大计算能力来加速各种计算任务。在最近邻搜索中,可以使用CUDA实现并行计算,将每个数据点与查询点之间的比较分配给GPU处理。这种方法可以大大缩短查询时间,并且可以处理大量的数据集。Thrust是一个基于CUDA的C++模板库,提供了一些高效的并行算法和数据结构。它可以轻松地将串行代码转换为并行代码,从而使代码更加高效。在最近邻搜索中,可以使用Thrust实现并行排序和并行查找算法,从而提高搜索效率。

2. CUDA代码完成加速

下面这段代码是基于哈希表的最近邻搜索算法的CUDA实现。该算法主要通过将三维点云分割成一个个小的栅格,并将点云放入栅格中,以此实现快速查找每个点云的最近邻。具体实现方法如下:

首先,将点云分为两部分,分别为d_first_point_cloudd_second_point_cloud,其中d_second_point_cloud为需要查找最近邻的点云。然后,通过调用CUDA核函数kernel_nearestNeighborSearch,对每个d_second_point_cloud中的点进行查找。

在核函数中,首先根据点的位置确定该点所在栅格的索引。如果该点不在栅格内,则将最近邻索引设为-1。接下来,遍历与该点所在栅格相邻的栅格,寻找距离该点最近的点,并返回该点的索引。

在遍历相邻的栅格时,考虑两种情况:如果栅格与该点所在栅格相同,则在栅格中查找距离该点最近的点,并将最近邻的索引存储在nn_index中;否则,在该栅格中找到一定数量的最近邻点,将它们与nn_index中的点进行比较,并更新nn_index中的最近邻点。

最后,将每个点的最近邻点的索引存储在d_nearest_neighbour_indexes数组中,以供后续使用。

/////////////////////////////////nearest neighbourhood search////////////////////////////////
/*
最近邻搜索核函数
param:
d_first_point_cloud:第一个点云
number_of_points_first_point_cloud:第一个点云的点数
d_second_point_cloud:第二个点云
number_of_points_second_point_cloud:第二个点云的点数
d_hashTable:hash表
d_buckets:栅格
rgd_params:网格参数
search_radius:搜索半径
max_number_considered_in_INNER_bucket:在内桶中考虑的最大点数
max_number_considered_in_OUTER_bucket:在外桶中考虑的最大点数
d_nearest_neighbour_indexes:最近邻索引
*/
__global__ void kernel_nearestNeighborSearch(
    pcl::PointXYZ *d_first_point_cloud,
    int number_of_points_first_point_cloud,
    pcl::PointXYZ *d_second_point_cloud,
    int number_of_points_second_point_cloud,
    hashElement *d_hashTable,
    bucket *d_buckets,
    gridParameters rgd_params,
    float search_radius,
    int max_number_considered_in_INNER_bucket,
    int max_number_considered_in_OUTER_bucket,
    int *d_nearest_neighbour_indexes)
{
    int index_of_point_second_point_cloud = blockIdx.x * blockDim.x + threadIdx.x; // 第二个点云的点索引

    if (index_of_point_second_point_cloud < number_of_points_second_point_cloud) // 如果索引小于第二个点云的点数
    {
        bool isok = false;

        float x = d_second_point_cloud[index_of_point_second_point_cloud].x;
        float y = d_second_point_cloud[index_of_point_second_point_cloud].y;
        float z = d_second_point_cloud[index_of_point_second_point_cloud].z;

        if (x < rgd_params.bounding_box_min_X || x > rgd_params.bounding_box_max_X) // 如果x坐标不在网格的范围内
        {
            d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1; // 将最近邻索引设为-1
            return;
        }
        if (y < rgd_params.bounding_box_min_Y || y > rgd_params.bounding_box_max_Y)
        {
            d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
            return;
        }
        if (z < rgd_params.bounding_box_min_Z || z > rgd_params.bounding_box_max_Z)
        {
            d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
            return;
        }

        int ix = (x - rgd_params.bounding_box_min_X) / rgd_params.resolution_X; // 计算栅格的索引
        int iy = (y - rgd_params.bounding_box_min_Y) / rgd_params.resolution_Y;
        int iz = (z - rgd_params.bounding_box_min_Z) / rgd_params.resolution_Z;

        int index_bucket = ix * rgd_params.number_of_buckets_Y *
                               rgd_params.number_of_buckets_Z +
                           iy * rgd_params.number_of_buckets_Z + iz; // 计算栅格在整个队列中的索引
        int nn_index = -1;

        if (index_bucket >= 0 && index_bucket < rgd_params.number_of_buckets) //
        {
            int sx, sy, sz, stx, sty, stz;
            if (ix == 0)
                sx = 0;
            else
                sx = -1;
            if (iy == 0)
                sy = 0;
            else
                sy = -1;
            if (iz == 0)
                sz = 0;
            else
                sz = -1;

            if (ix == rgd_params.number_of_buckets_X - 1)
                stx = 1;
            else
                stx = 2;
            if (iy == rgd_params.number_of_buckets_Y - 1)
                sty = 1;
            else
                sty = 2;
            if (iz == rgd_params.number_of_buckets_Z - 1)
                stz = 1;
            else
                stz = 2;

            float _distance = 100000000.0f;
            int index_next_bucket;
            int iter;
            int number_of_points_in_bucket;
            int l_begin;
            int l_end;

            for (int i = sx; i < stx; i++)
            {
                for (int j = sy; j < sty; j++)
                {
                    for (int k = sz; k < stz; k++)
                    {
                        index_next_bucket = index_bucket +
                                            i * rgd_params.number_of_buckets_Y * rgd_params.number_of_buckets_Z +
                                            j * rgd_params.number_of_buckets_Z + k;
                        if (index_next_bucket >= 0 && index_next_bucket < rgd_params.number_of_buckets)
                        {
                            number_of_points_in_bucket = d_buckets[index_next_bucket].number_of_points;
                            if (number_of_points_in_bucket <= 0)
                                continue;

                            int max_number_considered_in_bucket;
                            if (index_next_bucket == index_bucket)
                            {
                                max_number_considered_in_bucket = max_number_considered_in_INNER_bucket;
                            }
                            else
                            {
                                max_number_considered_in_bucket = max_number_considered_in_OUTER_bucket;
                            }
                            if (max_number_considered_in_bucket <= 0)
                                continue;

                            if (max_number_considered_in_bucket >= number_of_points_in_bucket)
                            {
                                iter = 1;
                            }
                            else
                            {
                                iter = number_of_points_in_bucket / max_number_considered_in_bucket;
                                if (iter <= 0)
                                    iter = 1;
                            }

                            l_begin = d_buckets[index_next_bucket].index_begin;
                            l_end = d_buckets[index_next_bucket].index_end;

                            for (int l = l_begin; l < l_end; l += iter)
                            {
                                if (l >= 0 && l < number_of_points_first_point_cloud)
                                {
                                    int hashed_index_of_point = d_hashTable[l].index_of_point;
                                    // inA[hashed_index_of_point].var = 1;

                                    float nn_x = d_first_point_cloud[hashed_index_of_point].x;
                                    float nn_y = d_first_point_cloud[hashed_index_of_point].y;
                                    float nn_z = d_first_point_cloud[hashed_index_of_point].z;

                                    float dist = (x - nn_x) * (x - nn_x) +
                                                 (y - nn_y) * (y - nn_y) +
                                                 (z - nn_z) * (z - nn_z);

                                    if (dist <= search_radius * search_radius)
                                    {
                                        if (dist < _distance)
                                        {
                                            isok = true;
                                            nn_index = hashed_index_of_point;
                                            _distance = dist;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        if (isok)
        {
            d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = nn_index;
        }
        else
        {
            d_nearest_neighbour_indexes[index_of_point_second_point_cloud] = -1;
        }
    }
}

然后我们下面的函数就是调用上面的函数一个CUDA并行的最近邻搜索算法。通过传入采用了两个点云,并将其中一个点云的每个点与第二个点云中最近的点进行匹配,即找到第二个点云中距离该点最近的点的索引。

cudaError_t cudaNearestNeighborSearch(
    int threads,
    pcl::PointXYZ *d_first_point_cloud,
    int number_of_points_first_point_cloud,
    pcl::PointXYZ *d_second_point_cloud,
    int number_of_points_second_point_cloud,
    hashElement *d_hashTable,
    bucket *d_buckets,
    gridParameters rgd_params,
    float search_radius,
    int max_number_considered_in_INNER_bucket,
    int max_number_considered_in_OUTER_bucket,
    int *d_nearest_neighbour_indexes)
{
    cudaError_t err = cudaGetLastError();

    int blocks = number_of_points_second_point_cloud / threads + 1;

    kernel_nearestNeighborSearch<<<blocks, threads>>>(
        d_first_point_cloud,
        number_of_points_first_point_cloud,
        d_second_point_cloud,
        number_of_points_second_point_cloud,
        d_hashTable,
        d_buckets,
        rgd_params,
        search_radius,
        max_number_considered_in_INNER_bucket,
        max_number_considered_in_OUTER_bucket,
        d_nearest_neighbour_indexes);

    err = cudaDeviceSynchronize();
    return err;
}

该函数为在CUDA平台上实现最近邻搜索(nearest neighbor search)算法。其输入参数为两个点云(pcl::PointCloudpcl::PointXYZ &first_point_cloud,pcl::PointCloudpcl::PointXYZ &second_point_cloud),搜索半径search_radiusBounding box extension参数,以及max_number_considered_in_INNER_bucketmax_number_considered_in_OUTER_bucket,这两个参数是用于限制每个bucket中内部和外部考虑的最大点数。最后一个参数nearest_neighbour_indexes为最近邻索引结果的输出。

函数的大致流程为:

  1. 检查最近邻索引的大小是否等于第二个点云的大小,如果不相等,返回false。
  2. 设置GPU设备,并且打印当前可用的设备线程数。
  3. 为第一个点云和第二个点云在GPU中分配内存,并将它们的数据从主机内存复制到GPU内存中。
  4. 计算网格参数,并在GPU中分配内存来存储哈希表和bucket
  5. 在GPU中计算网格并计算最近邻索引。
  6. 将最近邻索引从GPU内存复制回主机内存中,存储在nearest_neighbour_indexes中。
  7. 使用了OpenGL来显示点云数据。点云数据由两个不同的点云组成:first_point_cloudsecond_point_cloud。在循环中,对于第二个点云中的每个点,找到最近邻的点(在第一个点云中),并将这两个点作为一条线绘制出来。这里使用了OpenGL的glVertex3f函数来定义每个点的坐标。整个循环遍历second_point_cloud中的所有点,从而绘制所有的最近邻点对的连线。如果nearest_neighbour_indexes中的索引是-1,则说明该点没有找到最近邻点,不会被绘制出来。

函数中涉及到一些额外的函数,如cudaCalculateGridParamscudaCalculateGridcudaNearestNeighborSearch,这些函数都是在CUDA平台上执行的,并负责在GPU上执行相应的操作。

bool nearestNeighbourhoodSearch(
    pcl::PointCloud<pcl::PointXYZ> &first_point_cloud,
    pcl::PointCloud<pcl::PointXYZ> &second_point_cloud,
    float search_radius,
    float bounding_box_extension,
    int max_number_considered_in_INNER_bucket,
    int max_number_considered_in_OUTER_bucket,
    std::vector<int> &nearest_neighbour_indexes)
{
    if (nearest_neighbour_indexes.size() != second_point_cloud.size()) // 如果最近邻索引的大小不等于第二个点云的大小
        return false;                                                   // 返回false

    cudaError_t err = ::cudaSuccess;
    err = cudaSetDevice(0); // 设置设备
    if (err != ::cudaSuccess)
        return false;

    std::cout << "Before cudaMalloc" << std::endl;
    coutMemoryStatus();

    gridParameters rgd_params;
    pcl::PointXYZ *d_first_point_cloud = NULL;
    pcl::PointXYZ *d_second_point_cloud = NULL;
    int *d_nearest_neighbour_indexes = NULL;
    hashElement *d_hashTable = NULL;
    bucket *d_buckets = NULL;

    int threads = getNumberOfAvailableThreads();
    std::cout << "CUDA code will use " << threads << " device threads" << std::endl;
    if (threads == 0)
        return false;

    err = cudaMalloc((void **)&d_first_point_cloud, first_point_cloud.points.size() * sizeof(pcl::PointXYZ)); // 为第一个点云分配内存
    if (err != ::cudaSuccess)
        return false;
    err = cudaMemcpy(d_first_point_cloud, first_point_cloud.points.data(), first_point_cloud.points.size() * sizeof(pcl::PointXYZ), cudaMemcpyHostToDevice); // 将第一个点云的数据复制到GPU中
    if (err != ::cudaSuccess)
        return false;

    err = cudaMalloc((void **)&d_second_point_cloud, second_point_cloud.points.size() * sizeof(pcl::PointXYZ)); // 为第二个点云分配内存
    if (err != ::cudaSuccess)
        return false;
    err = cudaMemcpy(d_second_point_cloud, second_point_cloud.points.data(), second_point_cloud.points.size() * sizeof(pcl::PointXYZ), cudaMemcpyHostToDevice); // 将第二个点云的数据复制到GPU中
    if (err != ::cudaSuccess)
        return false;

    err = cudaCalculateGridParams(d_first_point_cloud, first_point_cloud.points.size(),
                                  search_radius, search_radius, search_radius, bounding_box_extension, rgd_params); // 计算网格参数
    if (err != ::cudaSuccess)
        return false;
    std::cout << "regular grid parameters:" << std::endl;
    std::cout << "bounding_box_min_X: " << rgd_params.bounding_box_min_X << std::endl;
    std::cout << "bounding_box_min_Y: " << rgd_params.bounding_box_min_Y << std::endl;
    std::cout << "bounding_box_min_Z: " << rgd_params.bounding_box_min_Z << std::endl;
    std::cout << "bounding_box_max_X: " << rgd_params.bounding_box_max_X << std::endl;
    std::cout << "bounding_box_max_Y: " << rgd_params.bounding_box_max_Y << std::endl;
    std::cout << "bounding_box_max_Z: " << rgd_params.bounding_box_max_Z << std::endl;
    std::cout << "number_of_buckets_X: " << rgd_params.number_of_buckets_X << std::endl;
    std::cout << "number_of_buckets_Y: " << rgd_params.number_of_buckets_Y << std::endl;
    std::cout << "number_of_buckets_Z: " << rgd_params.number_of_buckets_Z << std::endl;
    std::cout << "resolution_X: " << rgd_params.resolution_X << std::endl;
    std::cout << "resolution_Y: " << rgd_params.resolution_Y << std::endl;
    std::cout << "resolution_Z: " << rgd_params.resolution_Z << std::endl;

    err = cudaMalloc((void **)&d_hashTable, first_point_cloud.points.size() * sizeof(hashElement));
    if (err != ::cudaSuccess)
        return false;

    err = cudaMalloc((void **)&d_buckets, rgd_params.number_of_buckets * sizeof(bucket));
    if (err != ::cudaSuccess)
        return false;

    err = cudaMalloc((void **)&d_nearest_neighbour_indexes, second_point_cloud.points.size() * sizeof(int));
    if (err != ::cudaSuccess)
        return false;

    std::cout << "After cudaMalloc" << std::endl;
    coutMemoryStatus();

    err = cudaCalculateGrid(threads, d_first_point_cloud, d_buckets, d_hashTable, first_point_cloud.points.size(), rgd_params); // 计算网格
    if (err != ::cudaSuccess)
        return false;

    err = cudaNearestNeighborSearch(
        threads,
        d_first_point_cloud,
        first_point_cloud.points.size(),
        d_second_point_cloud,
        second_point_cloud.points.size(),
        d_hashTable,
        d_buckets,
        rgd_params,
        search_radius,
        max_number_considered_in_INNER_bucket,
        max_number_considered_in_OUTER_bucket,
        d_nearest_neighbour_indexes); // 计算最近邻索引
    if (err != ::cudaSuccess)
        return false;

    err = cudaMemcpy(nearest_neighbour_indexes.data(), d_nearest_neighbour_indexes, second_point_cloud.points.size() * sizeof(int), cudaMemcpyDeviceToHost); // 将最近邻索引复制到CPU中
    if (err != ::cudaSuccess)
    {
        return false;
    }

    err = cudaFree(d_first_point_cloud);
    d_first_point_cloud = NULL;
    if (err != ::cudaSuccess)
        return false;

    err = cudaFree(d_second_point_cloud);
    d_second_point_cloud = NULL;
    if (err != ::cudaSuccess)
        return false;

    err = cudaFree(d_hashTable);
    d_hashTable = NULL;
    if (err != ::cudaSuccess)
        return false;

    err = cudaFree(d_buckets);
    d_buckets = NULL;
    if (err != ::cudaSuccess)
        return false;

    err = cudaFree(d_nearest_neighbour_indexes);
    d_nearest_neighbour_indexes = NULL;
    if (err != ::cudaSuccess)
        return false;

    std::cout << "After cudaFree" << std::endl;
    coutMemoryStatus();

    return true;
}

//然后可以尝试着opengl显示
for (size_t i = 0; i < second_point_cloud.size(); i++)
    {
        int index_nn = nearest_neighbour_indexes[i];
        if (index_nn != -1 && index_nn >= 0 && index_nn < first_point_cloud.size())
        {
            glVertex3f(second_point_cloud[i].x, second_point_cloud[i].y, second_point_cloud[i].z);
            glVertex3f(first_point_cloud[index_nn].x, first_point_cloud[index_nn].y, first_point_cloud[index_nn].z);
        }
    }

3. Thrust代码完成加速

这段代码是使用Thrust库实现的最近邻搜索的函数。该函数的输入是两个点云first_point_cloudsecond_point_cloud,以及一个搜索半径search_radius。输出是一个向量nearest_neighbour_indexes,其中包含了first_point_cloud中每个点的最近邻在second_point_cloud中的索引。此外,在is_nearest_neighbor函数中,使用了一个for循环来遍历second_point_cloud中的每个点,计算该点和查询点之间的距离,并找到最小距离的点。如果找到了最近邻,则将最近邻的索引赋值给nearest_neighbour_indexes向量中对应的位置。这里使用了一个bool变量found_nearest_neighbor来标记是否找到了最近邻,函数返回该变量的值。

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/inner_product.h>
#include <vector>

struct PointXYZ
{
    float x, y, z;
};

__host__ __device__ float distance_squared(const PointXYZ &a, const PointXYZ &b) // 计算两点之间的距离
{
    float dx = a.x - b.x;
    float dy = a.y - b.y;
    float dz = a.z - b.z;
    return dx * dx + dy * dy + dz * dz;
}

struct is_nearest_neighbor
{
    const thrust::device_vector<PointXYZ> &first_point_cloud;
    const thrust::device_vector<PointXYZ> &second_point_cloud;
    const float search_radius;

    is_nearest_neighbor(const thrust::device_vector<PointXYZ> &first_point_cloud,
                        const thrust::device_vector<PointXYZ> &second_point_cloud,
                        const float search_radius)
        : first_point_cloud(first_point_cloud),
          second_point_cloud(second_point_cloud),
          search_radius(search_radius) {}

    __host__ __device__ bool operator()(const thrust::tuple<PointXYZ, int> &t) const
    {
        const PointXYZ &query_point = thrust::get<0>(t); // 获取点云中的点
        const int query_index = thrust::get<1>(t);         // 获取点云中的点的索引
        float min_distance_squared = search_radius * search_radius + 1.0f;
        bool found_nearest_neighbor = false; // 是否找到最近邻

        for (int i = 0; i < second_point_cloud.size(); ++i) // 遍历点云
        {
            const PointXYZ &candidate_point = second_point_cloud[i];                         // 获取点云中的点
            const float distance_squared = ::distance_squared(query_point, candidate_point); // 计算两点之间的距离
            if (distance_squared < min_distance_squared)                                     // 如果距离小于最小距离
            {
                min_distance_squared = distance_squared;
                nearest_neighbour_indexes[query_index] = i; // 将最近邻的索引赋值给nearest_neighbour_indexes
                found_nearest_neighbor = true;                // 找到最近邻
            }
        }

        return found_nearest_neighbor;
    }
};

bool nearestNeighbourhoodSearch(const thrust::device_vector<PointXYZ> &first_point_cloud,
                                const thrust::device_vector<PointXYZ> &second_point_cloud,
                                const float search_radius,
                                std::vector<int> &nearest_neighbour_indexes)
{
    if (first_point_cloud.size() == 0 || second_point_cloud.size() == 0)
        return false;

    nearest_neighbour_indexes.resize(first_point_cloud.size(), -1); // 初始化nearest_neighbour_indexes
    thrust::counting_iterator<int> index_sequence_begin(0);
    thrust::counting_iterator<int> index_sequence_end = index_sequence_begin + first_point_cloud.size(); // 获取点云的索引

    is_nearest_neighbor predicate(first_point_cloud, second_point_cloud, search_radius);                                     // 将点云和索引组合
    thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(first_point_cloud.begin(), index_sequence_begin)), // 将点云和索引组合

bool any_found = thrust::make_zip_iterator(thrust::make_tuple(first_point_cloud.end(), index_sequence_end)),
                             predicate,
                             false,
                             thrust::logical_or<bool>());

    return any_found ;
}

4. 结果显示

蓝色和红色是两个点云,绿色是匹配的的链接