-
Notifications
You must be signed in to change notification settings - Fork 1
/
getPredictedSignal.cu
123 lines (101 loc) · 4.15 KB
/
getPredictedSignal.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* getPredictedSignal.cu
Moises Hernandez-Fernandez - FMRIB Image Analysis Group
Copyright (C) 2005 University of Oxford */
// Method to calculate the model predicted signal on the GPU of a group of voxels given the model parameters.
#include "getPredictedSignal.h"
#include "functions_gpu.h"
#include "modelparameters.h"
#include "macro_numerical.h"
#include "modelfunctions.h"
using namespace std;
namespace Cudimot{
#define VOXELS_BLOCK 8
#define THREADS_VOXEL 32 // Multiple of 32: Threads collaborating to compute a voxel. Do not change this, otherwise Synchronization will be needed and shuffles cannot be used
template <typename T>
__global__ void getPredictedSignal_kernel(
int nvox, // nvoxels
int nmeas, // nmeasurements
int nsamples,
int CFP_Tsize, //size*M-measurements
int FixP_Tsize, // fixed params: size*N-voxels
T* samples, // samples of estimated parameters
T* CFP_global, // common fixed model parameters
T* FixP, // fixed model parameters
T* PredictedSignal)
{
// 1 block of threads process several voxels
// Each warp processes 1 voxel
int idVOX= (blockIdx.x*VOXELS_BLOCK)+int(threadIdx.x/THREADS_VOXEL);
int idVOX_inBlock = threadIdx.x/THREADS_VOXEL;
int idSubVOX= threadIdx.x%THREADS_VOXEL;
bool leader = (idSubVOX==0); // Some steps are performed by only one thread of the warp
////////// DYNAMIC SHARED MEMORY ///////////
extern __shared__ double shared[]; //Size:
T* CFP = (T*)shared; //nmeas*CMP_Tsize
T* meanSamples = (T*) &CFP[nmeas*CFP_Tsize];//NPARAMS*VOXELS_BLOCK
////////////////////////////////////////////
/// Copy common fixed model parameters to Shared Memory ///
if(threadIdx.x==0){ // only one thread of the whole block. Common to all voxels
for(int i=0;i<nmeas*CFP_Tsize;i++){
CFP[i]=CFP_global[i];
}
}
__syncthreads();
///////////////////////////////////////////////////////////
///////// each voxel/warp of the block points to its data///////////
samples = &samples[idVOX*NPARAMS*nsamples]; // Global
meanSamples = &meanSamples[idVOX_inBlock*NPARAMS];
PredictedSignal = &PredictedSignal[idVOX*nmeas]; //Global
FixP = &FixP[idVOX*FixP_Tsize]; // Global memory
////////////////////////////////////////////////////////////////////
/// Ititialise shared values of each voxel: only the leader///
if(leader){
if(nsamples>1){
for(int par=0;par<NPARAMS;par++){
T value=0;
for(int samp=0;samp<nsamples;samp++){
value+= samples[par*nsamples+samp];
}
meanSamples[par]=value/nsamples;
}
}else{
#pragma unroll
for(int par=0;par<NPARAMS;par++){
meanSamples[par]=samples[par];
}
}
}
__syncthreads();
int idMeasurement=idSubVOX;
int nmeas2compute = nmeas/THREADS_VOXEL;
if (idSubVOX<(nmeas%THREADS_VOXEL)) nmeas2compute++;
for(int iter=0;iter<nmeas2compute;iter++){
T* myCFP = &CFP[idMeasurement*CFP_Tsize];
T pred=Predicted_Signal(NPARAMS,meanSamples,myCFP,FixP);
PredictedSignal[idMeasurement]=pred;
idMeasurement+=THREADS_VOXEL;
}
}
template <typename T>
getPredictedSignal<T>::getPredictedSignal(){}
template <typename T>
void getPredictedSignal<T>::run(
int nvox, int nmeas, int nsamples,
int CFP_size, int FixP_size,
T* samples, T* CFP, T* FixP,
T* PredictedSignal)
{
long int amount_shared_mem = 0;
amount_shared_mem += (nmeas*CFP_size)*sizeof(T); // CFP
amount_shared_mem += (NPARAMS*VOXELS_BLOCK)*sizeof(T); //mean_samples
cout << "Shared Memory used in PredictedSignal kernel: " << amount_shared_mem << endl;
int threads_block = VOXELS_BLOCK * THREADS_VOXEL;
int nblocks=(nvox/VOXELS_BLOCK);
if(nvox%VOXELS_BLOCK) nblocks++;
getPredictedSignal_kernel<T><<<nblocks,threads_block,amount_shared_mem>>>(nvox,nmeas,nsamples,CFP_size,FixP_size,samples,CFP,FixP,PredictedSignal);
sync_check("getPredictedSignal Kernel");
}
// Explicit Instantiations of the template
template class getPredictedSignal<float>;
template class getPredictedSignal<double>;
}