Question: __global__ void mysgemm(bool transa, bool transb, int m, int n, int k, const float *A, const float *B, float *C)

?_global__voidmysgemm(booltransa,booltransb,intm,intn,intk,constfloat*A,constfloat*B,float*C){
/********************************************************************
*
*ComputeC=AxB
*whereAisa(mxk)matrix
*whereBisa(kxn)matrix
*whereCisa(mxn)matrix
*
*Usesharedmemoryfortiling
*
********************************************************************/
//INSERTKERNELCODEHERE
__shared__floatA_s[TILE_SIZE][TILE_SIZE];
__shared__floatB_s[TILE_SIZE][TILE_SIZE];
unsignedintrow=blockIdx.y*blockDim.y+threadIdx.y;
unsignedintcol=blockIdx.x*blockDim.x+threadIdx.x;
floatsum=0.0f;
unsignedintnum_tile=(k+TILE_SIZE-1)/TILE_SIZE;
for(unsignedinttile=0;tile<num_tile;++tile){
//loadtiletosharedmemory
if(transa){
if((row<m)&&(tile*TILE_SIZE+threadIdx.x<k)){//boundaryconditionforinputmatrixA
A_s[threadIdx.y][threadIdx.x]=A[(tile*TILE_SIZE+threadIdx.x)*m+row];
}
}else{
if((row<m)&&(tile*TILE_SIZE+threadIdx.x<k)){//boundaryconditionforinputmatrixA
A_s[threadIdx.y][threadIdx.x]=A[row*k+tile*TILE_SIZE+threadIdx.x];
}
}
if(transb){
if((col<n)&&(tile*TILE_SIZE+threadIdx.y<k)){
B_s[threadIdx.y][threadIdx.x]=B[col*k+tile*TILE_SIZE+threadIdx.y];
}
}else{
if((col<n)&&(tile*TILE_SIZE+threadIdx.y<k)){//boundaryconditionforinputmatrixB
B_s[threadIdx.y][threadIdx.x]=B[(tile*TILE_SIZE+threadIdx.y)*n+col];
}
}
__syncthreads();
//tilecomputation
intlimit=TILE_SIZE;
if(tile==num_tile-1){//tohandlewhenthetilevariableindicatesthelasttileofinput
limit=k-tile*TILE_SIZE;
}
if(row<m&&col<n){//boundaryconditionforoutputmatrixC
for(unsignedinti=0;i<limit;++i){
sum+=A_s[threadIdx.y][i]*B_s[i][threadIdx.x];
}
}
__syncthreads();
}
if(row

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Programming Questions!