Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda

Question

Welcome To Ask or Share your Answers For Others

Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda

posted Jan 31, 2022 in Technique[技术] by 深蓝 (71.8m points)

Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda

Let's consider the following array: tab = [80,12,14,5,70,9,26,30,8,12,16,15]. I want to compute the sum of every contiguous subsequence of size 4 using CUDA, for example:

S1=80+12+14+5=111
S2=12+14+5+70 =101
S3=14+5+70+9 =98
....

Do you have an efficient idea for parallelizing this task using CUDA? The array above is just an example; in my case I will use a huge one.

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

1 Reply

深蓝 · Answer 1 · 2022-01-31T07:23:32+0000

We can do this in a single operation (thrust::transform) using thrust. In CUDA, this can be considered to be a fairly simple 1-D stencil operation.

A good description of a 1-D stencil operation can be found here on slides 49-58.

This is actually a simplified case, since the stencil width is 4 and it is only on one "side" of the center point.

Here's a worked example comparing the 2 approaches:

$ cat t88.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#include <iostream>

const int nTPB=256;
typedef float mytype;
const int ds = 1048576*32;

// Functor used with thrust::transform over a 4-way zip_iterator: adds tuple
// elements 0, 1, 2 and 3 (in that order) and returns the total as mytype.
struct sum4
{
  template <typename Tuple>
  __host__ __device__
  mytype operator()(const Tuple window){
    // Explicit left-to-right grouping: ((e0 + e1) + e2) + e3.
    return ((thrust::get<0>(window) + thrust::get<1>(window))
            + thrust::get<2>(window))
           + thrust::get<3>(window);
  }
};

// Computes out[i] = in[i] + in[i+1] + in[i+2] + in[i+3] by staging a tile of
// the input in shared memory.
//
// Launch layout: 1-D grid of 1-D blocks with one thread per input element;
// the grid must supply at least dsize threads. blockDim.x must be >= 3 and
// <= nTPB, because the tile holds nTPB+3 elements and the first three threads
// of each block load the 3-element halo.
// Shared memory: (nTPB+3)*sizeof(T) bytes, statically allocated.
//
// in    - dsize input elements (read-only)
// out   - receives results for i in [0, dsize-4); nothing is written when
//         dsize <= 4
// dsize - number of elements in `in`
//
// NOTE(review): the window starting at i = dsize-4 also lies entirely inside
// `in` but is not produced here; main() in this file sizes `out` as dsize-4
// elements, so emitting it would require enlarging that buffer as well.
template <typename T>
__global__ void sum4kernel(const T * __restrict__ in, T * __restrict__ out, const unsigned dsize)
{

  __shared__ T sdata[nTPB+3];
  const unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
  // Every in-range thread stages its own element ...
  if (idx < dsize) sdata[threadIdx.x] = in[idx];
  // ... and the first three threads additionally stage the elements that
  // follow this block's range, so the block's last windows are complete.
  if ((threadIdx.x < 3) && ((idx+blockDim.x) < dsize)) sdata[threadIdx.x + blockDim.x] = in[idx + blockDim.x];
  // No thread returns before this point, so the barrier is reached by all.
  __syncthreads();
  // dsize is unsigned: a bare `dsize - 4` wraps around for dsize < 4 and would
  // let every thread store out of bounds, hence the explicit dsize >= 4 test.
  // Summing only inside the guard also means no thread reads a tile slot that
  // was never written.
  if ((dsize >= 4) && (idx < dsize - 4)) {
    T temp = sdata[threadIdx.x];
    temp += sdata[threadIdx.x+1];
    temp += sdata[threadIdx.x+2];
    temp += sdata[threadIdx.x+3];
    out[idx] = temp;
  }
}

// Prints a diagnostic and returns false when a CUDA runtime call failed.
static bool cudaOk(cudaError_t err, const char *what)
{
  if (err == cudaSuccess) return true;
  std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Bails out of main() with exit status 1 when a CUDA runtime call fails.
#define CUDA_TRY(call) do { if (!cudaOk((call), #call)) return 1; } while (0)

// Thrust version: out[i] = in[i]+in[i+1]+in[i+2]+in[i+3] for every i in
// [0, out.size()). The zipped input range is derived from the length of `out`,
// so the transform cannot write past the end of `out`.
// Precondition: out.size() + 3 <= in.size().
static void sum4thrust(const thrust::device_vector<mytype> &in, thrust::device_vector<mytype> &out)
{
  typedef thrust::device_vector<mytype>::const_iterator It;
  const It first = in.begin();
  const It last  = first + (out.end() - out.begin());
  thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(first, first+1, first+2, first+3)),
                    thrust::make_zip_iterator(thrust::make_tuple(last, last+1, last+2, last+3)),
                    out.begin(), sum4());
}

// Runs the thrust and the hand-written CUDA implementation on the small
// example from the question (printing both results), then times and validates
// both on a ds-element array of ones. Returns 0 on success, 1 on a CUDA error
// or a validation failure.
int main(){

  // ---- small worked example ----
  mytype hdata1[] = {80,12,14,5,70,9,26,30,8,12,16,15};
  // const makes ds1 a constant expression, so hres1 is an ordinary array
  // rather than a compiler-extension variable-length array.
  const unsigned ds1 = sizeof(hdata1)/sizeof(hdata1[0]);
  mytype hres1[ds1-4];
  thrust::device_vector<mytype> ddata1(hdata1, hdata1+ds1);
  thrust::device_vector<mytype> dres1(ds1-4);
  sum4thrust(ddata1, dres1);
  thrust::copy(dres1.begin(), dres1.end(), std::ostream_iterator<mytype>(std::cout, ","));
  std::cout << std::endl;
  // Zero the result buffer so the values printed next can only have come
  // from the kernel, not from the thrust run above.
  CUDA_TRY(cudaMemset(thrust::raw_pointer_cast(dres1.data()), 0, (ds1-4)*sizeof(mytype)));
  sum4kernel<<<(ds1+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata1.data()), thrust::raw_pointer_cast(dres1.data()), ds1);
  CUDA_TRY(cudaGetLastError());   // launch-configuration errors
  CUDA_TRY(cudaMemcpy(hres1, thrust::raw_pointer_cast(dres1.data()), (ds1-4)*sizeof(mytype), cudaMemcpyDeviceToHost));
  for (unsigned i = 0; i < ds1-4; i++)
    std::cout << hres1[i] << ",";
  std::cout << std::endl;

  // ---- timing on a large array of ones: every window must sum to 4 ----
  thrust::device_vector<mytype> ddata2(ds, 1);
  thrust::device_vector<mytype> dres2(ds-4);

  cudaEvent_t start, stop;
  CUDA_TRY(cudaEventCreate(&start));
  CUDA_TRY(cudaEventCreate(&stop));

  CUDA_TRY(cudaEventRecord(start));
  sum4thrust(ddata2, dres2);
  CUDA_TRY(cudaEventRecord(stop));
  thrust::host_vector<mytype> hres2 = dres2;
  float et = 0.0f;
  CUDA_TRY(cudaEventSynchronize(stop));   // make sure `stop` has completed before reading it
  CUDA_TRY(cudaEventElapsedTime(&et, start, stop));
  std::cout << "thrust time: " << et << "ms" << std::endl;

  int status = 0;
// validate
  for (int i = 0; i < ds-4; i++) if (hres2[i] != 4) {std::cout << "thrust validation failure: " << i << "," << hres2[i] << std::endl; status = 1; break;}

  if (status == 0) {
    // Zero the buffer (outside the timed region) so the validation below
    // really checks the kernel's output instead of thrust's leftovers.
    CUDA_TRY(cudaMemset(thrust::raw_pointer_cast(dres2.data()), 0, (ds-4)*sizeof(mytype)));
    CUDA_TRY(cudaEventRecord(start));
    sum4kernel<<<(ds+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata2.data()), thrust::raw_pointer_cast(dres2.data()), ds);
    CUDA_TRY(cudaGetLastError());   // launch-configuration errors
    CUDA_TRY(cudaEventRecord(stop));
    // Blocking copy: also surfaces any fault raised while the kernel ran.
    CUDA_TRY(cudaMemcpy(&(hres2[0]), thrust::raw_pointer_cast(dres2.data()), (ds-4)*sizeof(mytype), cudaMemcpyDeviceToHost));
    CUDA_TRY(cudaEventSynchronize(stop));
    CUDA_TRY(cudaEventElapsedTime(&et, start, stop));
    std::cout << "cuda time: " << et << "ms" << std::endl;
    for (int i = 0; i < ds-4; i++) if (hres2[i] != 4) {std::cout << "cuda validation failure: " << i << "," << hres2[i] << std::endl; status = 1; break;}
  }

  // Events are released on the normal and validation-failure paths; a failed
  // CUDA call returns immediately above and leaves cleanup to process exit.
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return status;
}


$ nvcc -arch=sm_61 -o t88 t88.cu
$ ./t88
111,101,98,110,135,73,76,66,
111,101,98,110,135,73,76,66,
thrust time: 0.902464ms
cuda time: 0.76288ms
$

For this particular GPU (Titan X Pascal) there is not much difference (~15%) between the thrust time for a 32M element data set and the CUDA time. We would expect this algorithm to be memory-bound.

For this Pascal Titan X, bandwidthTest reports about 345 GB/s of measurable memory bandwidth.

The CUDA implementation must load the entire data set size and store the entire data set size (approximately) = 2 operations per element, so the achieved bandwidth calculation for this CUDA code is:

(32*1048576 elements * 2 ops/element * 4 bytes/op) / 0.00076288 s = ~350GB/s

So it would appear that the CUDA implementation is achieving approximately the maximum available bandwidth.

Categories

Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda

Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda

Please log in or register to add a comment.

Please log in or register to reply this article.

1 Reply

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags