Testing CUDA Jupyter Features

Introduction

Let’s first make sure we have configured things correctly…

!nvidia-smi
Wed Nov  8 05:55:16 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.01              Driver Version: 546.01       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce GTX 980M        On  | 00000000:01:00.0  On |                  N/A |
| N/A   50C    P8              10W / 1... |    798MiB /  4096MiB |     10%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A        20      G   /Xwayland                                 N/A      |
|    0   N/A  N/A        20      G   /Xwayland                                 N/A      |
|    0   N/A  N/A        29      G   /Xwayland                                 N/A      |
+---------------------------------------------------------------------------------------+
!nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
%load_ext watermark
%watermark
Last updated: 2023-11-08T05:49:46.421340+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.17.2

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 5.15.90.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit
%load_ext nvcc_plugin
directory /home/app/blog/CUDA-coding-interview/src already exists
Out bin /home/app/blog/CUDA-coding-interview/result.out

%%cu
#include <cstdio>
#include <iostream>

using namespace std;

// Each block uses a single thread to scan its 256-element slice of a[]
// and writes the slice's maximum into b[blockIdx.x].
__global__ void maxi(int* a, int* b, int n)
{
    int block = 256 * blockIdx.x;
    int max = 0;

    for (int i = block; i < min(256 + block, n); i++) {
        if (max < a[i]) {
            max = a[i];
        }
    }
    b[blockIdx.x] = max;
}

int main()
{
    // Heads-up: 3 >> 2 evaluates to 0, not 12, so the array ends up empty,
    // the while loop below never launches a kernel, and the "maximum"
    // printed at the end is uninitialized memory.
    int n;
    n = 3 >> 2;
    int a[n];

    for (int i = 0; i < n; i++) {
        a[i] = rand() % n;
        cout << a[i] << "\t";
    }

    cudaEvent_t start, end;
    int *ad, *bd;
    int size = n * sizeof(int);
    cudaMalloc(&ad, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    int grids = ceil(n * 1.0f / 256.0f);
    cudaMalloc(&bd, grids * sizeof(int));

    dim3 grid(grids, 1);  // unused; the launch below passes `grids` directly
    dim3 block(1, 1);     // one thread per block; the kernel loops serially

    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    // Repeatedly reduce 256:1 until one element remains. (Note that `grids`
    // is computed once and never recomputed as n shrinks.)
    while (n > 1) {
        maxi<<<grids, block>>>(ad, bd, n);
        n = ceil(n * 1.0f / 256.0f);
        cudaMemcpy(ad, bd, n * sizeof(int), cudaMemcpyDeviceToDevice);
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0;
    cudaEventElapsedTime(&time, start, end);

    int ans[2];
    cudaMemcpy(ans, ad, 4, cudaMemcpyDeviceToHost);  // copy back one int (4 bytes)

    cout << "The maximum element is : " << ans[0] << endl;

    cout << "The time required : ";
    cout << time << endl;
}
The maximum element is : -130232120
The time required : 0.00288

This is cool for CUDA basics and it proves the plumbing works, though look closely at the output: the "maximum" is garbage because n = 3 >> 2 evaluates to 0, so the kernel never actually runs. The bigger limitation is that a bare %%cu cell has no good way of adding other libraries or installing cooler dependencies. While this makes exploratory programming and learning quite easy for newbies, you want to use CMake / Spack for more complex build systems and dependencies.

That all being said, let's see if we can get something working with cuDF and some of the NVIDIA RAPIDS tools.

What about cccl?

Let’s try it out. Apparently it should just be a part of the CUDA installation, although there is also CMake support.
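
One cheap sanity check first: print the Thrust version bundled with the toolkit. (This is a throwaway cell of my own, not from the cccl docs.)

%%cu
#include <thrust/version.h>
#include <cstdio>

int main() {
  // The THRUST_*_VERSION macros come from the toolkit's thrust/version.h;
  // if this cell compiles at all, the CCCL headers are on nvcc's path.
  std::printf("Thrust %d.%d.%d\n",
              THRUST_MAJOR_VERSION,
              THRUST_MINOR_VERSION,
              THRUST_SUBMINOR_VERSION);
  return 0;
}

If that compiles and prints a version, the headers are in place and we can move on to the real example.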

%%cu
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <cub/block/block_reduce.cuh>
#include <cuda/atomic>
#include <cstdio>
#include <iostream>  // std::cout in the error path below
#include <cassert>   // assert() at the end

constexpr int block_size = 256;

__global__ void reduce(int const* data, int* result, int N) {
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int const index = threadIdx.x + blockIdx.x * blockDim.x;
  int sum = 0;
  if (index < N) {
    sum += data[index];
  }
  sum = BlockReduce(temp_storage).Sum(sum);

  if (threadIdx.x == 0) {
    cuda::atomic_ref<int, cuda::thread_scope_device> atomic_result(*result);
    atomic_result.fetch_add(sum, cuda::memory_order_relaxed);
  }
}

int main() {

  // Allocate and initialize input data
  int const N = 1000;
  thrust::device_vector<int> data(N);
  thrust::fill(data.begin(), data.end(), 1);

  // Allocate output data
  thrust::device_vector<int> kernel_result(1);

  // Compute the sum reduction of `data` using a custom kernel
  int const num_blocks = (N + block_size - 1) / block_size;
  reduce<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(data.data()),
                                     thrust::raw_pointer_cast(kernel_result.data()),
                                     N);

  auto const err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
    return -1;
  }

  // Compute the same sum reduction using Thrust
  int const thrust_result = thrust::reduce(thrust::device, data.begin(), data.end(), 0);

  // Ensure the two solutions are identical
  std::printf("Custom kernel sum: %d\n", kernel_result[0]);
  std::printf("Thrust reduce sum: %d\n", thrust_result);
  assert(kernel_result[0] == thrust_result);
  return 0;
}
In file included from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:727,
                 from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/atomic:18,
                 from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/atomic:14,
                 from /tmp/tmpcr1suyx3/40500096-ce82-4eef-b40e-5b8680d38f1d.cu:4:
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h:12:4: error: #error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
   12 | #  error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
      |    ^~~~~

After squashing an initial bug (not having #include <thrust/execution_policy.h>), I have run into my first blocker: libcu++'s device-scoped atomics need sm_60 or newer, and my poor GTX 980M is only sm_52…
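
In principle the workaround is straightforward: drop down to the legacy atomicAdd() intrinsic, which carries no sm_60 requirement. A sketch like the following (which I haven't actually run here) should compile for sm_52:

%%cu
#include <thrust/device_vector.h>
#include <cub/block/block_reduce.cuh>
#include <cstdio>

constexpr int block_size = 256;

// Same block-level reduction as before, but the cross-block accumulation
// uses the classic atomicAdd() intrinsic instead of cuda::atomic_ref.
__global__ void reduce(int const* data, int* result, int N) {
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int const index = threadIdx.x + blockIdx.x * blockDim.x;
  int sum = (index < N) ? data[index] : 0;
  sum = BlockReduce(temp_storage).Sum(sum);

  if (threadIdx.x == 0) {
    atomicAdd(result, sum);  // works on pre-sm_60 GPUs
  }
}

int main() {
  int const N = 1000;
  thrust::device_vector<int> data(N, 1);           // 1000 ones
  thrust::device_vector<int> kernel_result(1, 0);  // accumulator starts at 0

  int const num_blocks = (N + block_size - 1) / block_size;
  reduce<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(data.data()),
                                     thrust::raw_pointer_cast(kernel_result.data()),
                                     N);
  cudaDeviceSynchronize();

  std::printf("Custom kernel sum: %d\n", static_cast<int>(kernel_result[0]));
  return 0;
}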

It seems like the Thrust calls on their own should be fine, though. Let's remove the offending atomic (and the custom kernel with it) and try just the Thrust half of the cccl example:

%%cu
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <cstdio>

int main() {

  // Allocate and initialize input data
  int const N = 1000;
  thrust::device_vector<int> data(N);
  thrust::fill(data.begin(), data.end(), 1);

  // Compute the sum reduction using Thrust alone (the kernel_result
  // vector from the previous cell is no longer needed)
  int const thrust_result = thrust::reduce(thrust::device, data.begin(), data.end(), 0);

  std::printf("Thrust reduce sum: %d\n", thrust_result);
  return 0;
}

What about RAPIDS?

I was planning on following the installation guide and installing with pip/poetry, but both methods were timing out… What I need to do is pick a subset of packages rather than trying to install all of cudf-cu12 dask-cudf-cu12 cuml-cu12 cugraph-cu12 cuspatial-cu12 cuproj-cu12 cuxfilter-cu12 cucim when I don't know what I really want immediately.
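
For a first pass, just cuDF should be plenty. Something like this (untested as of this writing; the extra index URL is the one the RAPIDS install guide points pip at):

!pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12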