多个 block、且 block 内线程需要同步时,如何求 GPU 上的总耗时?

/*
 * Multi-block, multi-thread sum of squares with per-block timing.
 *
 * Each thread accumulates num[i] * num[i] into its own slot of dynamic shared
 * memory, visiting i = bid * THREAD_NUM + tid and stepping by
 * BLOCK_NUM * THREAD_NUM while i < DATA_SIZE. After a block-wide barrier,
 * thread 0 adds the per-thread slots together and stores the block's partial
 * sum in result[bid].
 *
 * Launch contract (follows from the indexing below):
 *   - grid  = BLOCK_NUM blocks, block = THREAD_NUM threads (1D)
 *   - dynamic shared memory >= THREAD_NUM * sizeof(int)
 *
 * Parameters:
 *   num    - input array, read at indices [0, DATA_SIZE)
 *   result - output, one partial sum per block (written at [0, BLOCK_NUM))
 *   time   - output, 2 * BLOCK_NUM entries: time[bid] is thread 0's clock()
 *            reading at kernel entry, time[bid + BLOCK_NUM] its reading after
 *            the block's partial sum has been stored
 */
__global__ static void sumOfSquares_mbmt_syn(int* num, int* result,
                                             clock_t* time)
{
    /* One int per thread; sized by the third launch-configuration argument. */
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int i;

    /* Start stamp for this block, taken by thread 0 only. */
    if (tid == 0) time[bid] = clock();

    /* Shared memory is uninitialized, so clear this thread's slot first. */
    shared[tid] = 0;
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
    {
        shared[tid] += num[i] * num[i];
    }

    /* Every thread's slot must be complete before thread 0 reads them. */
    __syncthreads();
    if (tid == 0)
    {
        for (i = 1; i < THREAD_NUM; i++)
        {
            shared[0] += shared[i];
        }
        result[bid] = shared[0];
    }

    /* End stamp for this block, stored BLOCK_NUM entries after the start stamp. */
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

这是我的kernel 函数,
我在CPU端是这样调用的

// Host copy of the timing buffer: entries [0, BLOCK_NUM) receive the per-block
// start stamps, entries [BLOCK_NUM, 2 * BLOCK_NUM) the per-block end stamps.
clock_t time_used[BLOCK_NUM * 2];

// Third launch argument: one int of dynamic shared memory per thread.
sumOfSquares_mbmt_syn<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time);

// A kernel launch returns no status itself; configuration errors are only
// visible through cudaGetLastError().
cudaError_t timing_status = cudaGetLastError();

// Blocking copy; its return code also carries errors raised while the kernel
// ran. Pass the array itself rather than a pointer-to-array.
if (timing_status == cudaSuccess)
    timing_status = cudaMemcpy(time_used, time, sizeof(time_used), cudaMemcpyDeviceToHost);
// NOTE(review): check timing_status == cudaSuccess before reading time_used.

我知道每个 block 所花费的时间是 time_used[i + BLOCK_NUM] - time_used[i](0 <= i < BLOCK_NUM)。
我现在的问题是GPU 上消耗的总时间该怎么求?
应该不是把每个块所花费的时间 简单地累加起来吧