1 Star 0 Fork 0

fmscole/how-to-optimize-gemm

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
MMult_1x4_8.c 2.71 KB
一键复制 编辑 原始数据 按行查看 历史
/*
 * Column-major element accessors.
 *
 * X( row, col ) expands to an lvalue for element (row, col) of the array
 * named by the lowercase identifier (a, b or c) that is visible at the point
 * of use, together with its leading dimension (lda, ldb or ldc) from the same
 * scope.  Consecutive rows of one column are adjacent in memory; stepping to
 * the next column skips one leading dimension.
 */
#define A( row, col ) a[ (row) + (col)*lda ]
#define B( row, col ) b[ (row) + (col)*ldb ]
#define C( row, col ) c[ (row) + (col)*ldc ]
/* Routine for computing C = A * B + C */
/* Forward declaration of the 1x4 kernel that MY_MMult calls; the definition
   appears further down in this file. */
void AddDot1x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc );
void MY_MMult( int m, int n, int k, double *a, int lda,
double *b, int ldb,
double *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */
/* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
one routine (four inner products) */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
/*
 * AddDot1x4 -- update one row of four adjacent columns of C with
 * four inner products.
 *
 * Storage is column-major: element (i,j) of a matrix x with leading
 * dimension ldx is x[ j*ldx + i ].  The arguments are addressed relative to
 * the pointers passed in, so with a = &A( i,0 ), b = &B( 0,j ) and
 * c = &C( i,j ) in the caller's matrices this routine performs
 *
 *     C( i, j+jj ) += sum over p in [0,k) of A( i,p ) * B( p,j+jj )
 *
 * for jj = 0, 1, 2, 3.
 *
 *   k      : length of the inner products; any value is accepted, and for
 *            k <= 0 no products are accumulated
 *   a, lda : row of A to use; successive elements are lda doubles apart
 *   b, ldb : first of four columns of B; columns are ldb doubles apart
 *   c, ldc : first of four entries of C to update; entries are ldc apart
 *
 * The p-loop is unrolled by four; a trailing loop handles the k % 4 leftover
 * terms so that nothing past index k-1 of A or B is read.
 */
void AddDot1x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc )
{
  /* Cursors into the four columns of B; each advances one row per term. */
  const double *b0 = b;
  const double *b1 = b + ldb;
  const double *b2 = b + 2*ldb;
  const double *b3 = b + 3*ldb;

  /* Running contributions to the four entries of C. */
  double c0 = 0.0, c1 = 0.0, c2 = 0.0, c3 = 0.0;

  /* Current element of the row of A, loaded once and reused four times. */
  double a0p;

  int p = 0;

  /* Main loop: four terms of each inner product per iteration. */
  for ( ; k - p >= 4; p += 4 ){
    a0p = a[ p*lda ];
    c0 += a0p * *b0++;
    c1 += a0p * *b1++;
    c2 += a0p * *b2++;
    c3 += a0p * *b3++;

    a0p = a[ (p+1)*lda ];
    c0 += a0p * *b0++;
    c1 += a0p * *b1++;
    c2 += a0p * *b2++;
    c3 += a0p * *b3++;

    a0p = a[ (p+2)*lda ];
    c0 += a0p * *b0++;
    c1 += a0p * *b1++;
    c2 += a0p * *b2++;
    c3 += a0p * *b3++;

    a0p = a[ (p+3)*lda ];
    c0 += a0p * *b0++;
    c1 += a0p * *b1++;
    c2 += a0p * *b2++;
    c3 += a0p * *b3++;
  }

  /* Remainder: the last k % 4 terms, one at a time. */
  for ( ; p < k; p += 1 ){
    a0p = a[ p*lda ];
    c0 += a0p * *b0++;
    c1 += a0p * *b1++;
    c2 += a0p * *b2++;
    c3 += a0p * *b3++;
  }

  c[ 0 ]     += c0;
  c[ ldc ]   += c1;
  c[ 2*ldc ] += c2;
  c[ 3*ldc ] += c3;
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
C
1
https://gitee.com/fmscole/how-to-optimize-gemm.git
git@gitee.com:fmscole/how-to-optimize-gemm.git
fmscole
how-to-optimize-gemm
how-to-optimize-gemm
master

搜索帮助