cudaMemcpyBatchAsync有没有实例,老是invalid argument错误

cudaMemcpyBatchAsync在实现hosttodevice拷贝时可以调通,但是devicetohost一直报invalid argument错误,代码在一楼

cuda: 12.8.1
OS: ubuntu 22.04
gpu: Quadro P3200
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

include <cuda_runtime.h>
include
include

define CHECK(call)
do {
cudaError_t err = call;
if (err != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at " << FILE << “:” << LINE << std::endl;
exit(EXIT_FAILURE);
}
} while(0)

int main() {
const size_t count = 3;
const size_t size = 1024;

float *h_ptrs[count]; // Host pointers
float *d_ptrs[count]; // Device pointers
size_t sizes[count];
for (int i = 0; i < count; ++i) {
    CHECK(cudaMallocHost(&h_ptrs[i], size * sizeof(float)));
    CHECK(cudaMalloc(&d_ptrs[i], size * sizeof(float)));    
    sizes[i] = size;
}

CHECK(cudaSetDevice(0));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

size_t attrsIdxs[2];
attrsIdxs[0] = 0;
attrsIdxs[1] = 2;

// 设置两个拷贝属性
cudaMemcpyAttributes attrs[2];

attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[0].srcLocHint.type = cudaMemLocationTypeDevice;
attrs[0].srcLocHint.id = 0;
attrs[0].dstLocHint.type = cudaMemLocationTypeHost;
attrs[0].dstLocHint.id = 0;
attrs[0].flags = cudaMemcpyDefault;

attrs[1].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[1].srcLocHint.type = cudaMemLocationTypeDevice;
attrs[1].srcLocHint.id = 0;
attrs[1].dstLocHint.type = cudaMemLocationTypeHost;
attrs[1].dstLocHint.id = 0;
attrs[1].flags = cudaMemcpyDefault;

// 启动 batch async 拷贝
size_t failIdx;
**CHECK(cudaMemcpyBatchAsync((void **)h_ptrs, (void **)d_ptrs, sizes, count, attrs, attrsIdxs, 2, &failIdx, stream));**

// 等待完成
CHECK(cudaDeviceSynchronize());

std::cout << "Batch copy completed." << std::endl;

// 清理
for (int i = 0; i < count; ++i) {
    cudaFree(d_ptrs[i]);
    cudaFreeHost(h_ptrs[i]);
}

return 0;

}