Testing CUDA Jupyter Features
Introduction
Let’s first make sure we have configured things correctly…
!nvidia-smi
Wed Nov 8 05:55:16 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.01 Driver Version: 546.01 CUDA Version: 12.3 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce GTX 980M On | 00000000:01:00.0 On | N/A |
| N/A 50C P8 10W / 1... | 798MiB / 4096MiB | 10% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 20 G /Xwayland N/A |
| 0 N/A N/A 20 G /Xwayland N/A |
| 0 N/A N/A 29 G /Xwayland N/A |
+---------------------------------------------------------------------------------------+
!nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
%load_ext watermark
%watermark
Last updated: 2023-11-08T05:49:46.421340+00:00
Python implementation: CPython
Python version : 3.10.12
IPython version : 8.17.2
Compiler : GCC 11.4.0
OS : Linux
Release : 5.15.90.1-microsoft-standard-WSL2
Machine : x86_64
Processor : x86_64
CPU cores : 8
Architecture: 64bit
%load_ext nvcc_plugin
directory /home/app/blog/CUDA-coding-interview/src already exists
Out bin /home/app/blog/CUDA-coding-interview/result.out
%%cu
#include <cstdio>
#include <iostream>

using namespace std;

__global__ void maxi(int* a, int* b, int n)
{
    int block = 256 * blockIdx.x;
    int max = 0;
    for (int i = block; i < min(256 + block, n); i++) {
        if (max < a[i]) {
            max = a[i];
        }
    }
    b[blockIdx.x] = max;
}

int main()
{
    int n;
    n = 3 >> 2;
    int a[n];
    for (int i = 0; i < n; i++) {
        a[i] = rand() % n;
        cout << a[i] << "\t";
    }

    cudaEvent_t start, end;
    int *ad, *bd;
    int size = n * sizeof(int);
    cudaMalloc(&ad, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    int grids = ceil(n * 1.0f / 256.0f);
    cudaMalloc(&bd, grids * sizeof(int));

    dim3 grid(grids, 1, 1);
    dim3 block(1);

    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    while (n > 1) {
        maxi<<<grids, block>>>(ad, bd, n);
        n = ceil(n * 1.0f / 256.0f);
        cudaMemcpy(ad, bd, n * sizeof(int), cudaMemcpyDeviceToDevice);
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);

    float time = 0;
    cudaEventElapsedTime(&time, start, end);

    int ans[2];
    cudaMemcpy(ans, ad, 4, cudaMemcpyDeviceToHost);

    cout << "The maximum element is : " << ans[0] << endl;
    cout << "The time required : ";
    cout << time << endl;
}
The maximum element is : -130232120
The time required : 0.00288
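Before moving on, note that the printed maximum is garbage: 3 >> 2 evaluates to 0, so n is 0, the array is never filled, and the final cudaMemcpy reads from an empty allocation. Because none of the CUDA calls in the example check their return codes, all of this fails silently. A minimal error-checking macro (my own sketch, not part of the original example) makes that kind of failure loud:

%%cu
#include <cstdio>
#include <cstdlib>

// Minimal CUDA error-checking macro: print the failure location and abort.
#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err_ = (call);                                \
        if (err_ != cudaSuccess) {                                \
            std::fprintf(stderr, "CUDA error: %s at %s:%d\n",     \
                         cudaGetErrorString(err_),                \
                         __FILE__, __LINE__);                     \
            std::exit(EXIT_FAILURE);                              \
        }                                                         \
    } while (0)

int main() {
    int host = 41;
    int* dev = nullptr;
    // Round-trip a single int through device memory, checking every call.
    CUDA_CHECK(cudaMalloc(&dev, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(dev, &host, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(&host, dev, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dev));
    std::printf("round-trip ok: %d\n", host);
    return 0;
}

Wrapping every cudaMalloc / cudaMemcpy / event call above in CUDA_CHECK would very likely have flagged the bad copy immediately.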
This is cool for CUDA basics, and the tooling works fine, but it lacks anything substantial for pulling in other libraries or installing more interesting dependencies. While this makes exploratory programming and learning quite easy for newbies, you want to use CMake / Spack for more complex build systems and dependencies.
That all being said, let’s see if we can get something working with cuDF and some of the NVIDIA RAPIDS tools.
What about cccl?
Let’s try it out. Apparently it should just be a part of the CUDA installation, although there is also CMake support.
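As a quick sanity check, the bundled Thrust and CUB headers expose version macros, so a tiny cell can confirm which CCCL components this toolkit ships. A sketch, with the macros coming from thrust/version.h and cub/version.cuh:

%%cu
#include <thrust/version.h>
#include <cub/version.cuh>
#include <cstdio>

int main() {
    // THRUST_VERSION encodes major * 100000 + minor * 100 + subminor;
    // the bundled versions tell us which CCCL snapshot shipped with nvcc.
    std::printf("Thrust %d.%d.%d (THRUST_VERSION=%d)\n",
                THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION,
                THRUST_SUBMINOR_VERSION, THRUST_VERSION);
    std::printf("CUB_VERSION=%d\n", CUB_VERSION);
    return 0;
}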
%%cu
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <cub/block/block_reduce.cuh>
#include <cuda/atomic>
#include <cstdio>

constexpr int block_size = 256;

__global__ void reduce(int const* data, int* result, int N) {
    using BlockReduce = cub::BlockReduce<int, block_size>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int const index = threadIdx.x + blockIdx.x * blockDim.x;
    int sum = 0;
    if (index < N) {
        sum += data[index];
    }
    sum = BlockReduce(temp_storage).Sum(sum);

    if (threadIdx.x == 0) {
        cuda::atomic_ref<int, cuda::thread_scope_device> atomic_result(*result);
        atomic_result.fetch_add(sum, cuda::memory_order_relaxed);
    }
}

int main() {
    // Allocate and initialize input data
    int const N = 1000;
    thrust::device_vector<int> data(N);
    thrust::fill(data.begin(), data.end(), 1);

    // Allocate output data
    thrust::device_vector<int> kernel_result(1);

    // Compute the sum reduction of `data` using a custom kernel
    int const num_blocks = (N + block_size - 1) / block_size;
    reduce<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(data.data()),
                                       thrust::raw_pointer_cast(kernel_result.data()),
                                       N);

    auto const err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
        return -1;
    }

    // Compute the same sum reduction using Thrust
    int const thrust_result = thrust::reduce(thrust::device, data.begin(), data.end(), 0);

    // Ensure the two solutions are identical
    std::printf("Custom kernel sum: %d\n", kernel_result[0]);
    std::printf("Thrust reduce sum: %d\n", thrust_result);
    assert(kernel_result[0] == thrust_result);
    return 0;
}
In file included from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:727,
from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/atomic:18,
from /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/atomic:14,
from /tmp/tmpcr1suyx3/40500096-ce82-4eef-b40e-5b8680d38f1d.cu:4:
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h:12:4: error: #error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
12 | # error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows."
| ^~~~~
After squashing an initial bug (not having #include <thrust/execution_policy.h>), I have run into my first blocker: atomics aren’t supported on my poor 980M… The GTX 980M is Maxwell (compute capability sm_52), below the sm_60 floor that libcu++ atomics require on *nix.
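Before ripping the atomics out entirely, there is an old-school escape hatch: the atomicAdd intrinsic works on much older compute capabilities, so a variant like the sketch below (the same kernel with cuda::atomic_ref swapped for atomicAdd; untested on this machine) should compile for sm_52:

%%cu
#include <thrust/device_vector.h>
#include <cub/block/block_reduce.cuh>
#include <cstdio>

constexpr int block_size = 256;

// Same block-level reduction as above, but the grid-level combine uses the
// legacy atomicAdd intrinsic instead of cuda::atomic_ref, so it avoids the
// sm_60 requirement on pre-Pascal parts like this Maxwell card.
__global__ void reduce_legacy(int const* data, int* result, int N) {
    using BlockReduce = cub::BlockReduce<int, block_size>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int const index = threadIdx.x + blockIdx.x * blockDim.x;
    int sum = 0;
    if (index < N) {
        sum += data[index];
    }
    sum = BlockReduce(temp_storage).Sum(sum);

    if (threadIdx.x == 0) {
        atomicAdd(result, sum);  // relaxed ordering only, but that is all we need here
    }
}

int main() {
    int const N = 1000;
    thrust::device_vector<int> data(N, 1);    // N ones
    thrust::device_vector<int> result(1, 0);  // accumulator starts at 0

    int const num_blocks = (N + block_size - 1) / block_size;
    reduce_legacy<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(data.data()),
                                              thrust::raw_pointer_cast(result.data()),
                                              N);
    cudaDeviceSynchronize();

    std::printf("Legacy atomic sum: %d\n", static_cast<int>(result[0]));
    return 0;
}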
It seems like the Thrust kernels on their own might be fine? Let’s remove the unwanted atomic and try just the cccl example:
%%cu
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <cstdio>

int main() {
    // Allocate and initialize input data
    int const N = 1000;
    thrust::device_vector<int> data(N);
    thrust::fill(data.begin(), data.end(), 1);

    // Allocate output data
    thrust::device_vector<int> kernel_result(1);

    // Compute the same sum reduction using Thrust
    int const thrust_result = thrust::reduce(thrust::device, data.begin(), data.end(), 0);
    std::printf("Thrust reduce sum: %d\n", thrust_result);
    return 0;
}
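For what it’s worth, Thrust can also replace the hand-rolled max kernel from the start of this post with a single call. A sketch using thrust::max_element (standard Thrust, though I haven’t timed it against the custom kernel):

%%cu
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/extrema.h>
#include <cstdio>

int main() {
    // Fill a device vector with 0, 1, ..., 999
    thrust::device_vector<int> data(1000);
    thrust::sequence(data.begin(), data.end());

    // One call does what the maxi kernel above did by hand:
    // a device-side reduction that finds the largest element.
    auto it = thrust::max_element(data.begin(), data.end());
    std::printf("The maximum element is : %d\n", static_cast<int>(*it));
    return 0;
}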
What about RAPIDS?
I was planning on following the installation guide and installing with pip/poetry, but both methods were timing out… What I need to do is pick a subset of packages, rather than try to install all of cudf-cu12 dask-cudf-cu12 cuml-cu12 cugraph-cu12 cuspatial-cu12 cuproj-cu12 cuxfilter-cu12 cucim when I don’t know what I really want immediately.
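For instance, starting with just cuDF should look something like the line below. The package name and extra index URL are what the RAPIDS pip instructions use for CUDA 12; treat it as a sketch, since I haven’t confirmed the install completes here.

!pip install cudf-cu12 --extra-index-url=https://pypi.nvidia.com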