请教各位前辈:关于 PyCUDA 在多线程中调用内核的问题

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import numpy as np
import pycuda.autoinit  # initialises CUDA and creates a context on the importing thread
import pycuda.driver as drv
from pycuda import tools
from pycuda.compiler import SourceModule

# Compile the demo kernel once at import time. SourceModule loads the compiled
# code into the CUDA context that is current on this thread, so a context must
# already exist here (the file-level `import pycuda.autoinit` provides one).
#
# The literal is a raw string so the "\n" escapes reach the CUDA compiler
# unchanged, and <stdio.h> is included explicitly so device-side printf is
# declared regardless of what the toolkit headers pull in implicitly.
mod = SourceModule(
    r'''
    #include <stdio.h>

    __global__ void test(int a)
    {
        printf("%d\n", a);
        printf("hello world\n");
    }
    '''
)

def process(i):
    """Launch the ``test`` kernel once, passing *i*, from the calling thread.

    Safe to call from worker threads: the shared CUDA context is made current
    for the duration of the call and released again afterwards.

    Args:
        i: Integer handed to the kernel; it must fit in a 32-bit signed int
            because the kernel parameter is declared ``int``.

    Raises:
        pycuda.driver.Error: If the launch or the synchronisation fails.
    """
    # A CUDA context is only current on the thread that created or pushed it.
    # Pool threads start with no context, so without this push every driver
    # call below fails in the worker.
    ctx = pycuda.autoinit.context
    ctx.push()
    try:
        stream = drv.Stream()
        kernel = mod.get_function("test")
        # np.int32 matches the kernel's ``int a`` parameter and its "%d" format.
        kernel(np.int32(i), block=(2, 1, 1), grid=(1, 1), stream=stream)
        # Launches are asynchronous: wait here so launch errors are raised in
        # this call and the device-side printf buffer is flushed to stdout.
        stream.synchronize()
    finally:
        # Always undo the push so the thread's context stack stays balanced,
        # even when the launch raised.
        ctx.pop()


if __name__ == '__main__':
    # Run ten kernel launches on a pool of four worker threads. The `with`
    # block waits for all submitted tasks and shuts the pool down on exit.
    with ThreadPoolExecutor(4) as executor:
        futures = [executor.submit(process, i) for i in range(10)]

    # A Future stores any exception raised in its worker; calling result()
    # re-raises it here. Without this, failures inside process() are silently
    # discarded and the script appears to do nothing.
    for future in futures:
        future.result()

请教各位前辈:我想用 PyCUDA 在多线程中调用 CUDA 内核,但一直没有成功,请问这个问题该如何解决?