The KAP optimizing preprocessor enhances performance on machines
with cache memory using a combination of memory padding, loop
blocking, loop interchanging, and outer loop unrolling to optimize
reuse of operands in cache memory. Memory management in KAP is
enabled when /scalaropt=3 and /roundoff=3 are set.
When arrays are being processed in loops, KAP improves memory
access patterns by working on small sections of the arrays that
fit in the cache and give large cache hit ratios. You control the
sizes of these sections by using the memory management command
qualifiers /cacheline, /cachesize, /fpregisters, /dpregisters, and
/setassociativity. The default settings for these qualifiers
are the standard machine characteristics for which the code is
being targeted. These settings will give the best results for
most programs. However, for some specialized cases, you may want
to modify these settings to adjust the sizes of the array sections
that are meant to reside in the cache.
KAP determines at run time which of two memory management
algorithms to use, depending on the settings of the OpenVMS Alpha
/cacheline and /setassociativity qualifiers.
One algorithm, the KAP default, keeps square blocks of data in
the cache, while the other keeps long, narrow blocks of data in
the cache. The following example shows how loop blocking, loop
interchanging, and loop unrolling can be used to improve the
performance of this matrix multiplication code. The KAP command
qualifiers used were /optimize=3, /scalaropt=3, /roundoff=3,
/addressresolution=2, and /arclimit=2. The /arclimit=2 qualifier
was used because the arrays are function arguments and KAP, by
default, assumes they might overlap.
/*
 * matm -- straightforward matrix multiplication, a = b * c, over the
 * leading n-by-n corner of three arrays that have 200 columns per row.
 *
 * n        number of rows and columns to process
 * a        result array; every a[i][j] with 0 <= i,j < n is overwritten
 * b, c     operand arrays; only elements with indices below n are read
 *
 * Returns a[3][5] as it stands after the multiplication.  That element is
 * read unconditionally, so the caller must supply an `a` with at least
 * four rows even when n is smaller than that.
 */
double matm(int n, double a[200][200], double b[200][200], double c[200][200])
{
    int i, j, k;

    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++) {
            a[i][j] = 0.0;
            /* Inner product of row i of b with column j of c. */
            for (k = 0; k < n; k++)
                a[i][j] = a[i][j] + b[i][k] * c[k][j];
        }
    return (a[3][5]);
}
Becomes:
/*
 * matm -- blocked and unrolled matrix multiplication, a = b * c, over the
 * leading n-by-n corner of three arrays that have 200 columns per row.
 *
 * The work is done in two phases:
 *   1. a is cleared, four rows per pass, with a clean-up loop for the
 *      rows left over when n is not a multiple of 4.
 *   2. The product is accumulated block by block.  The row, inner-product
 *      and column index ranges are each cut into pieces of at most 15.
 *      Inside a block, a 3-row by 3-column group of result elements is
 *      held in scalar temporaries while the inner-product loop runs, with
 *      clean-up loops for leftover columns and leftover rows.
 *
 * For every result element the products are still added in ascending
 * inner-index order, the same order a plain triple loop would use.
 *
 * n        number of rows and columns to process
 * a        result array; every a[i][j] with 0 <= i,j < n is overwritten
 * b, c     operand arrays; only elements with indices below n are read
 *
 * Returns a[3][5].  That element is read unconditionally, so the caller
 * must supply an `a` with at least four rows even when n is smaller.
 *
 * The _Kii and _Kdd names are the integer and double temporaries of the
 * generated code and are kept as they are.
 */
double matm(int n, double (*a)[200], double (*b)[200], double (*c)[200])
{
    int i, j;
    int _Kii1, _Kii4, _Kii5, _Kii11, _Kii12, _Kii18, _Kii19, _Kii31;
    int _Kii39, _Kii40, _Kii41, _Kii42, _Kii43, _Kii44, _Kii45, _Kii46;
    int _Kii47, _Kii48, _Kii49, _Kii50, _Kii51, _Kii52, _Kii53, _Kii54;
    int _Kii55, _Kii56, _Kii57, _Kii58, _Kii59, _Kii60, _Kii61;
    double _Kdd17, _Kdd18, _Kdd19, _Kdd20, _Kdd21, _Kdd22, _Kdd23;
    double _Kdd24, _Kdd25, _Kdd26, _Kdd27, _Kdd28, _Kdd29, _Kdd30;
    double _Kdd31, _Kdd32, _Kdd33, _Kdd34, _Kdd35;

    /* ---- Phase 1: clear the result, four rows per pass. ---- */
    _Kii18 = n - 1;
    _Kii39 = n / 4;
    for (_Kii41 = 0; _Kii41 <= n - 4; _Kii41 += 4) {
        for (_Kii40 = 0; _Kii40 <= _Kii18; _Kii40++) {
            a[_Kii41][_Kii40] = 0.0;
            a[_Kii41+1][_Kii40] = 0.0;
            a[_Kii41+2][_Kii40] = 0.0;
            a[_Kii41+3][_Kii40] = 0.0;
        }
    }
    /* Rows left over when n is not a multiple of 4. */
    _Kii1 = _Kii39 * 4;
    _Kii19 = n - 1;
    for (i = _Kii1; i <= _Kii19; i++) {
        for (j = 0; j <= _Kii19; j++) {
            a[i][j] = 0.0;
        }
    }

    /* ---- Phase 2: blocked multiply. ---- */
    /*
     * The first inner-product block and the first column block have
     * length (n - 1) % 15 + 1; every later block has length 15, so the
     * block lengths always add up to n.
     */
    _Kii4 = n;
    _Kii5 = (_Kii4 - 1) % (15) + 1;
    _Kii11 = n;
    _Kii12 = (_Kii11 - 1) % (15) + 1;
    _Kii31 = n - 1;

    /* Row blocks: rows _Kii45 .. _Kii61, at most 15 of them. */
    for (_Kii45 = 0; _Kii45 <= _Kii31; _Kii45 += 15) {
        _Kii61 = ((_Kii31) < (_Kii45 + 14) ? (_Kii31) : (_Kii45 + 14));
        _Kii60 = 0;              /* first inner index of the current block */
        _Kii59 = _Kii5;          /* length of the current inner block      */
        _Kii46 = _Kii61 - 2;     /* last row that can start a 3-row group  */

        /* Inner-product blocks; _Kii44 only counts them off. */
        for (_Kii44 = 1; _Kii44 <= _Kii4; _Kii44 += 15) {
            _Kii58 = 0;          /* first column of the current block      */
            _Kii57 = _Kii12;     /* length of the current column block     */
            /* Last inner index of this block, one copy per inner loop. */
            _Kii54 = _Kii60 + _Kii59 - 1;
            _Kii53 = _Kii60 + _Kii59 - 1;
            _Kii49 = _Kii60 + _Kii59 - 1;
            _Kii48 = _Kii60 + _Kii59 - 1;

            /* Column blocks; _Kii43 only counts them off. */
            for (_Kii43 = 1; _Kii43 <= _Kii11; _Kii43 += 15) {
                _Kii52 = _Kii58 + _Kii57 - 3;

                /* Three rows at a time. */
                for (_Kii41 = _Kii45; _Kii41 <= _Kii46; _Kii41 += 3) {
                    /* 3 rows x 3 columns kept in scalars. */
                    for (_Kii40 = _Kii58; _Kii40 <= _Kii52; _Kii40 += 3) {
                        _Kdd35 = a[_Kii41][_Kii40];
                        _Kdd34 = a[_Kii41+1][_Kii40];
                        _Kdd33 = a[_Kii41+2][_Kii40];
                        _Kii56 = _Kii40 + 1;
                        _Kdd32 = a[_Kii41][_Kii56];
                        _Kdd31 = a[_Kii41+1][_Kii56];
                        _Kdd30 = a[_Kii41+2][_Kii56];
                        _Kii55 = _Kii40 + 2;
                        _Kdd29 = a[_Kii41][_Kii55];
                        _Kdd28 = a[_Kii41+1][_Kii55];
                        _Kdd27 = a[_Kii41+2][_Kii55];
                        for (_Kii42 = _Kii60; _Kii42 <= _Kii54; _Kii42++) {
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd35 += _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd34 += _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd33 += _Kdd17;
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd32 += _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd31 += _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd30 += _Kdd17;
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd29 += _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd28 += _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd27 += _Kdd17;
                        }
                        a[_Kii41][_Kii40] = _Kdd35;
                        a[_Kii41+1][_Kii40] = _Kdd34;
                        a[_Kii41+2][_Kii40] = _Kdd33;
                        a[_Kii41][_Kii56] = _Kdd32;
                        a[_Kii41+1][_Kii56] = _Kdd31;
                        a[_Kii41+2][_Kii56] = _Kdd30;
                        a[_Kii41][_Kii55] = _Kdd29;
                        a[_Kii41+1][_Kii55] = _Kdd28;
                        a[_Kii41+2][_Kii55] = _Kdd27;
                    }
                    /* Leftover columns of this block, still 3 rows deep;
                       _Kii40 carries on from the loop above. */
                    for ( ; _Kii40 <= _Kii58 + _Kii57 - 1; _Kii40++) {
                        _Kdd26 = a[_Kii41][_Kii40];
                        _Kdd25 = a[_Kii41+1][_Kii40];
                        _Kdd24 = a[_Kii41+2][_Kii40];
                        for (_Kii42 = _Kii60; _Kii42 <= _Kii53; _Kii42++) {
                            _Kdd18 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd26 += _Kdd18;
                            _Kdd18 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd25 += _Kdd18;
                            _Kdd18 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd24 += _Kdd18;
                        }
                        a[_Kii41][_Kii40] = _Kdd26;
                        a[_Kii41+1][_Kii40] = _Kdd25;
                        a[_Kii41+2][_Kii40] = _Kdd24;
                    }
                }

                /* Leftover rows of this row block, one at a time;
                   _Kii41 carries on from the 3-row loop above. */
                _Kii47 = _Kii58 + _Kii57 - 3;
                for ( ; _Kii41 <= _Kii61; _Kii41++) {
                    /* 1 row x 3 columns. */
                    for (_Kii40 = _Kii58; _Kii40 <= _Kii47; _Kii40 += 3) {
                        _Kdd23 = a[_Kii41][_Kii40];
                        _Kii51 = _Kii40 + 1;
                        _Kdd22 = a[_Kii41][_Kii51];
                        _Kii50 = _Kii40 + 2;
                        _Kdd21 = a[_Kii41][_Kii50];
                        for (_Kii42 = _Kii60; _Kii42 <= _Kii49; _Kii42++) {
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd23 += _Kdd19;
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii51];
                            _Kdd22 += _Kdd19;
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii50];
                            _Kdd21 += _Kdd19;
                        }
                        a[_Kii41][_Kii40] = _Kdd23;
                        a[_Kii41][_Kii51] = _Kdd22;
                        a[_Kii41][_Kii50] = _Kdd21;
                    }
                    /* Leftover columns, single row. */
                    for ( ; _Kii40 <= _Kii58 + _Kii57 - 1; _Kii40++) {
                        _Kdd20 = a[_Kii41][_Kii40];
                        for (_Kii42 = _Kii60; _Kii42 <= _Kii48; _Kii42++) {
                            _Kdd20 += b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                        }
                        a[_Kii41][_Kii40] = _Kdd20;
                    }
                }

                /* Step to the next column block; all later ones are 15 wide. */
                _Kii58 += _Kii57;
                _Kii57 = 15;
            }

            /* Step to the next inner-product block; later ones are 15 long. */
            _Kii60 += _Kii59;
            _Kii59 = 15;
        }
    }
    return a[3][5];
}