Vector Addition¶

In this example, we will learn how to implement a vector addition kernel in hidet script.

In [3]:
import hidet

def vector_addition(n):
    # the hidet.lang module is the submodule that implements hidet script
    from hidet.lang import attr, f32
    
    # import cuda specific extern variables
    from hidet.lang.cuda import threadIdx, blockIdx, blockDim
    
    from hidet.transforms.tools import add_packed_func

    with hidet.script_module() as script_module:

        @hidet.script
        def kernel(a: f32[n], b: f32[n], c: f32[n]):
            
            # mark this function as a cuda kernel
            attr.func_kind = 'cuda_kernel'
            
            # set the block dimension and grid dimensions
            attr.cuda_block_dim = 256
            attr.cuda_grid_dim = (n + 255) / 256
            
            # get the index of the thread among all threads
            idx = threadIdx.x + blockIdx.x * blockDim.x
            
            if idx < n:
                c[idx] = a[idx] + b[idx]

    ir_module = script_module.ir_module()
    
    # because we can not run the cuda kernel, we creates a packed function to launch it
    add_packed_func(ir_module, func=kernel, pack_func_name='add')
    
    return hidet.driver.build_ir_module(ir_module, func_name='add')

n = 5
add_func = vector_addition(n)
print(add_func.source(color=True))
#include <stdint.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <hidet/runtime/cuda_context.h>
#include <hidet/runtime/cpu_context.h>
typedef float tfloat32_t;
#define __float_to_tf32(x) (x)
extern "C" {

__global__ void __launch_bounds__(256) hidet_kernel(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
  if ((int)threadIdx.x < 5) {
    c[(int)threadIdx.x] = (a[(int)threadIdx.x] + b[(int)threadIdx.x]);
  } 
}

__host__ void hidet_add(int32_t num_args, int32_t * __restrict__ arg_types, void* * __restrict__ args) {
  assert(((void)"Expect 3 arguments", (num_args == 3)));
  assert(((void)"The 0-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[0] == 3)));
  assert(((void)"The 1-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[1] == 3)));
  assert(((void)"The 2-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[2] == 3)));
  hidet_kernel<<<dim3(1, 1, 1), dim3(256, 1, 1), 0, (cudaStream_t)get_cuda_stream()>>>(((float*)(args[0])), ((float*)(args[1])), ((float*)(args[2])));
}

}

In [2]:
a = hidet.randint(low=0, high=3, shape=[n]).to('float32').cuda()
b = hidet.randint(low=0, high=3, shape=[n]).to('float32').cuda()
c = hidet.randn([n]).cuda()
print(a)
print(b)
add_func(a, b, c)
print(c)
Tensor(shape=(5,), dtype='float32', device='cuda:0')
[2. 0. 2. 0. 1.]
Tensor(shape=(5,), dtype='float32', device='cuda:0')
[0. 1. 1. 2. 2.]
Tensor(shape=(5,), dtype='float32', device='cuda:0')
[2. 1. 3. 2. 3.]