使用 cudaMemcpyBatchAsync 做 host-to-device 拷贝时可以正常调通,但做 device-to-host 拷贝时一直返回 invalid argument 错误,复现代码在一楼。
cuda: 12.8.1
OS: ubuntu 22.04
gpu: Quadro P3200
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
cuda: 12.8.1
OS: ubuntu 22.04
gpu: Quadro P3200
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdlib>
#include <iostream>
// CHECK(call): evaluates a CUDA runtime call exactly once. If the returned
// cudaError_t is not cudaSuccess, prints the error string together with the
// source file and line of the call site to std::cerr and terminates the
// process with EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement (safe inside unbraced if/else); the argument is parenthesized so
// any expression can be passed.
#define CHECK(call)                                                          \
    do {                                                                     \
        cudaError_t check_err_ = (call);                                     \
        if (check_err_ != cudaSuccess) {                                     \
            std::cerr << "CUDA Error: " << cudaGetErrorString(check_err_)    \
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
int main() {
const size_t count = 3;
const size_t size = 1024;
float *h_ptrs[count]; // Host pointers
float *d_ptrs[count]; // Device pointers
size_t sizes[count];
for (int i = 0; i < count; ++i) {
CHECK(cudaMallocHost(&h_ptrs[i], size * sizeof(float)));
CHECK(cudaMalloc(&d_ptrs[i], size * sizeof(float)));
sizes[i] = size;
}
CHECK(cudaSetDevice(0));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
size_t attrsIdxs[2];
attrsIdxs[0] = 0;
attrsIdxs[1] = 2;
// 设置两个拷贝属性
cudaMemcpyAttributes attrs[2];
attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[0].srcLocHint.type = cudaMemLocationTypeDevice;
attrs[0].srcLocHint.id = 0;
attrs[0].dstLocHint.type = cudaMemLocationTypeHost;
attrs[0].dstLocHint.id = 0;
attrs[0].flags = cudaMemcpyDefault;
attrs[1].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[1].srcLocHint.type = cudaMemLocationTypeDevice;
attrs[1].srcLocHint.id = 0;
attrs[1].dstLocHint.type = cudaMemLocationTypeHost;
attrs[1].dstLocHint.id = 0;
attrs[1].flags = cudaMemcpyDefault;
// 启动 batch async 拷贝
size_t failIdx;
**CHECK(cudaMemcpyBatchAsync((void **)h_ptrs, (void **)d_ptrs, sizes, count, attrs, attrsIdxs, 2, &failIdx, stream));**
// 等待完成
CHECK(cudaDeviceSynchronize());
std::cout << "Batch copy completed." << std::endl;
// 清理
for (int i = 0; i < count; ++i) {
cudaFree(d_ptrs[i]);
cudaFreeHost(h_ptrs[i]);
}
return 0;
}