In this example, we will learn how to implement a vector addition kernel in hidet script.
import hidet
def vector_addition(n):
# the hidet.lang module is the submodule that implements hidet script
from hidet.lang import attr, f32
# import cuda specific extern variables
from hidet.lang.cuda import threadIdx, blockIdx, blockDim
from hidet.transforms.tools import add_packed_func
with hidet.script_module() as script_module:
@hidet.script
def kernel(a: f32[n], b: f32[n], c: f32[n]):
# mark this function as a cuda kernel
attr.func_kind = 'cuda_kernel'
# set the block dimension and grid dimensions
attr.cuda_block_dim = 256
attr.cuda_grid_dim = (n + 255) / 256
# get the index of the thread among all threads
idx = threadIdx.x + blockIdx.x * blockDim.x
if idx < n:
c[idx] = a[idx] + b[idx]
ir_module = script_module.ir_module()
# because we can not run the cuda kernel, we creates a packed function to launch it
add_packed_func(ir_module, func=kernel, pack_func_name='add')
return hidet.driver.build_ir_module(ir_module, func_name='add')
n = 5
add_func = vector_addition(n)
print(add_func.source(color=True))
#include <stdint.h> #include <cuda_fp16.h> #include <cuda_bf16.h> #include <hidet/runtime/cuda_context.h> #include <hidet/runtime/cpu_context.h> typedef float tfloat32_t; #define __float_to_tf32(x) (x) extern "C" { __global__ void __launch_bounds__(256) hidet_kernel(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) { if ((int)threadIdx.x < 5) { c[(int)threadIdx.x] = (a[(int)threadIdx.x] + b[(int)threadIdx.x]); } } __host__ void hidet_add(int32_t num_args, int32_t * __restrict__ arg_types, void* * __restrict__ args) { assert(((void)"Expect 3 arguments", (num_args == 3))); assert(((void)"The 0-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[0] == 3))); assert(((void)"The 1-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[1] == 3))); assert(((void)"The 2-th argument should be TensorPointerType(tensor(float32, [5]))", (arg_types[2] == 3))); hidet_kernel<<<dim3(1, 1, 1), dim3(256, 1, 1), 0, (cudaStream_t)get_cuda_stream()>>>(((float*)(args[0])), ((float*)(args[1])), ((float*)(args[2]))); } }
a = hidet.randint(low=0, high=3, shape=[n]).to('float32').cuda()
b = hidet.randint(low=0, high=3, shape=[n]).to('float32').cuda()
c = hidet.randn([n]).cuda()
print(a)
print(b)
add_func(a, b, c)
print(c)
Tensor(shape=(5,), dtype='float32', device='cuda:0') [2. 0. 2. 0. 1.] Tensor(shape=(5,), dtype='float32', device='cuda:0') [0. 1. 1. 2. 2.] Tensor(shape=(5,), dtype='float32', device='cuda:0') [2. 1. 3. 2. 3.]