7.1.2 Memory Management Techniques

The KAP optimizing preprocessor enhances performance on machines with cache memory using a combination of memory padding, loop blocking, loop interchanging, and outer loop unrolling to optimize reuse of operands in cache memory. Memory management in KAP is enabled when /scalaropt=3 and /roundoff=3 are set.

When arrays are being processed in loops, KAP improves memory access patterns by working on small sections of the arrays that fit in the cache and give large cache hit ratios. You control the sizes of these sections by using the memory management command qualifiers /cacheline, /cachesize, /fpregisters, /dpregisters, and /setassociativity. The default settings for these are the standard machine characteristics for which the code is being targeted. These settings will give the best results for most programs. However, for some specialized cases, you may want to modify these settings to adjust the sizes of the array sections that are meant to reside in the cache.

KAP determines at run time which of two memory management algorithms to use, depending on the settings for OpenVMS Alpha /cacheline and /setassociativity. One algorithm, the KAP default, keeps square blocks of data in the cache, while the other keeps long, narrow blocks of data in the cache. The following example shows how loop blocking, loop interchanging, and loop unrolling can be used to improve the performance of this matrix multiplication code. The KAP command qualifiers used were /optimize=3, /scalaropt=3, /roundoff=3, /addressresolution=2, and /arclimit=2. The /arclimit=2 qualifier was used because the arrays are function arguments and KAP, by default, assumes they might overlap.

/*
 * matm - straightforward matrix multiply, a = b * c, over the leading
 * n x n elements of the three arrays.
 *
 *   n        number of rows and columns to process; each row holds 200
 *            elements, so n must not exceed 200
 *   a        result array, overwritten for rows and columns 0 .. n-1
 *   b, c     input arrays, only read
 *
 * Returns a[3][5].  That element is only written by this routine when
 * n is at least 6; for smaller n the caller's existing value is returned.
 */
double matm(n, a, b, c)
int n;
double a[200][200], b[200][200], c[200][200];
{
    int i,j,k;

    for (i=0;i<n;i++)
        for (j=0;j<n;j++)
        {
            /* Clear the element, then accumulate the dot product of
               row i of b with column j of c. */
            a[i][j]=0.0;
            for (k=0;k<n;k++)
                a[i][j]=a[i][j]+b[i][k]*c[k][j];
        }
    return (a[3][5]);
}

Becomes:

/*
 * matm - cache-blocked matrix multiply, a = b * c, over the leading
 * n x n elements of the three arrays (KAP-transformed form of the
 * simple triple loop).
 *
 *   n        number of rows and columns to process; each row holds 200
 *            elements, so n must not exceed 200
 *   a        result array, overwritten for rows and columns 0 .. n-1
 *   b, c     input arrays, only read
 *
 * Returns a[3][5].  That element is only written by this routine when
 * n is at least 6; for smaller n the caller's existing value is returned.
 *
 * The result array is cleared first.  The product is then accumulated
 * tile by tile: rows, the summation index, and columns are each walked
 * in blocks of at most 15, and inside a tile the row and column loops
 * are unrolled by 3 with scalar temporaries holding the running sums.
 * Because partial sums are formed per block, the order of the floating
 * point additions differs from the simple triple loop.
 */
double matm( n, a, b, c )
int n;
double  (*a)[200];
double  (*b)[200];
double  (*c)[200];

{
    int i;
    int j;
    int k;
    int _Kii1;
    int _Kii4;
    int _Kii5;
    int _Kii11;
    int _Kii12;
    int _Kii18;
    int _Kii19;
    int _Kii31;
    double _Kdd17;
    double _Kdd18;
    double _Kdd19;
    int _Kii39;
    int _Kii40;
    int _Kii41;
    int _Kii42;
    int _Kii43;
    int _Kii44;
    int _Kii45;
    int _Kii46;
    int _Kii47;
    int _Kii48;
    double _Kdd20;
    int _Kii49;
    double _Kdd21;
    int _Kii50;
    double _Kdd22;
    int _Kii51;
    double _Kdd23;
    int _Kii52;
    int _Kii53;
    double _Kdd24;
    double _Kdd25;
    double _Kdd26;
    int _Kii54;
    double _Kdd27;
    double _Kdd28;
    double _Kdd29;
    int _Kii55;
    double _Kdd30;
    double _Kdd31;
    double _Kdd32;
    int _Kii56;
    double _Kdd33;
    double _Kdd34;
    double _Kdd35;
    int _Kii57;
    int _Kii58;
    int _Kii59;
    int _Kii60;
    int _Kii61;

    _Kii18 = n -1;
    _Kii39 = n / 4;

    /* Clear the result, four rows per pass. */
    for ( _Kii41= 0; _Kii41<=n - 4; _Kii41+=4 ) {
        for ( _Kii40 = 0; _Kii40<=_Kii18; _Kii40++ ) {
            a[_Kii41][_Kii40] = 0.0;
            a[_Kii41+1][_Kii40] = 0.0;
            a[_Kii41+2][_Kii40] = 0.0;
            a[_Kii41+3][_Kii40] = 0.0;
        }
    }
    /* Clear the rows left over when n is not a multiple of 4. */
    _Kii1 = _Kii39 * 4;
    _Kii19 = n - 1;
    for ( i = _Kii1; i<=_Kii19; i++ ) {
        for ( j = 0; j<=_Kii19; j++ ) {
            a[i][j] = 0.0;
        }
    }

    /* Width of the first block along the summation index (_Kii5) and
       along the columns (_Kii12): the odd-sized block, 1..15 wide, is
       handled first and every later block is exactly 15 wide. */
    _Kii4 = n;
    _Kii5 = (_Kii4 - 1)%(15) + 1;
    _Kii11 = n;
    _Kii12 = (_Kii11 - 1)%(15) + 1;
    _Kii31 = n - 1;

    /* Row blocks: rows _Kii45 .. _Kii61, at most 15 rows each. */
    for ( _Kii45 = 0; _Kii45<=_Kii31; _Kii45+=15 ) {
        _Kii61 = ((_Kii31)<(_Kii45 + 14) ? (_Kii31) : (_Kii45 + 14));
        _Kii60 = 0;            /* first summation index of the current block */
        _Kii59 = _Kii5;        /* width of the current summation block */
        _Kii46 = _Kii61 - 2;   /* last row at which a 3-row group can start */

        /* Summation-index blocks; _Kii44 only counts the blocks. */
        for ( _Kii44 = 1; _Kii44<=_Kii4; _Kii44+=15 ) {
            _Kii58 = 0;        /* first column of the current block */
            _Kii57 = _Kii12;   /* width of the current column block */
            /* Last summation index of this block, one copy per inner loop. */
            _Kii54 = _Kii60 + _Kii59 - 1;
            _Kii53 = _Kii60 + _Kii59 - 1;
            _Kii49 = _Kii60 + _Kii59 - 1;
            _Kii48 = _Kii60 + _Kii59 - 1;

            /* Column blocks; _Kii43 only counts the blocks. */
            for ( _Kii43 = 1; _Kii43<=_Kii11; _Kii43+=15 ) {
                _Kii52 = _Kii58 + _Kii57 - 3;

                /* Rows taken three at a time. */
                for ( _Kii41 = _Kii45; _Kii41<=_Kii46; _Kii41+=3 ) {
                    /* 3 x 3 tile: nine running sums kept in scalars. */
                    for ( _Kii40 = _Kii58; _Kii40<=_Kii52; _Kii40+=3 ) {
                        _Kdd35 = a[_Kii41][_Kii40];
                        _Kdd34 = a[_Kii41+1][_Kii40];
                        _Kdd33 = a[_Kii41+2][_Kii40];
                        _Kii56 = _Kii40 + 1;
                        _Kdd32 = a[_Kii41][_Kii56];
                        _Kdd31 = a[_Kii41+1][_Kii56];
                        _Kdd30 = a[_Kii41+2][_Kii56];
                        _Kii55 = _Kii40 + 2;
                        _Kdd29 = a[_Kii41][_Kii55];
                        _Kdd28 = a[_Kii41+1][_Kii55];
                        _Kdd27 = a[_Kii41+2][_Kii55];
                        for ( _Kii42 = _Kii60; _Kii42<=_Kii54; _Kii42++ ) {
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd35 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd34 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd33 +=  _Kdd17;
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd32 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd31 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii56];
                            _Kdd30 +=  _Kdd17;
                            _Kdd17 = b[_Kii41][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd29 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd28 +=  _Kdd17;
                            _Kdd17 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii55];
                            _Kdd27 +=  _Kdd17;
                        }
                        a[_Kii41][_Kii40] = _Kdd35;
                        a[_Kii41+1][_Kii40] = _Kdd34;
                        a[_Kii41+2][_Kii40] = _Kdd33;
                        a[_Kii41][_Kii56] = _Kdd32;
                        a[_Kii41+1][_Kii56] = _Kdd31;
                        a[_Kii41+2][_Kii56] = _Kdd30;
                        a[_Kii41][_Kii55] = _Kdd29;
                        a[_Kii41+1][_Kii55] = _Kdd28;
                        a[_Kii41+2][_Kii55] = _Kdd27;
                    }
                    /* Columns of this block left over after the groups
                       of three: 3 rows x 1 column. */
                    for ( ; _Kii40<=_Kii58 + _Kii57 - 1; _Kii40++ ) {
                        _Kdd26 = a[_Kii41][_Kii40];
                        _Kdd25 = a[_Kii41+1][_Kii40];
                        _Kdd24 = a[_Kii41+2][_Kii40];
                        for ( _Kii42 = _Kii60; _Kii42<=_Kii53; _Kii42++ ) {
                            _Kdd18 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd26 +=  _Kdd18;
                            _Kdd18 = b[_Kii41+1][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd25 +=  _Kdd18;
                            _Kdd18 = b[_Kii41+2][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd24 +=  _Kdd18;
                        }
                        a[_Kii41][_Kii40] = _Kdd26;
                        a[_Kii41+1][_Kii40] = _Kdd25;
                        a[_Kii41+2][_Kii40] = _Kdd24;
                    }
                }

                /* Rows of this block left over after the groups of
                   three; _Kii41 carries on from the loop above. */
                _Kii47 = _Kii58 + _Kii57 - 3;
                for ( ; _Kii41<=_Kii61; _Kii41++ ) {
                    /* 1 row x 3 columns. */
                    for ( _Kii40 = _Kii58; _Kii40<=_Kii47; _Kii40+=3 ) {
                        _Kdd23 = a[_Kii41][_Kii40];
                        _Kii51 = _Kii40 + 1;
                        _Kdd22 = a[_Kii41][_Kii51];
                        _Kii50 = _Kii40 + 2;
                        _Kdd21 = a[_Kii41][_Kii50];
                        for ( _Kii42 = _Kii60; _Kii42<=_Kii49; _Kii42++ ) {
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                            _Kdd23 +=  _Kdd19;
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii51];
                            _Kdd22 +=  _Kdd19;
                            _Kdd19 = b[_Kii41][_Kii42] * c[_Kii42][_Kii50];
                            _Kdd21 +=  _Kdd19;
                        }
                        a[_Kii41][_Kii40] = _Kdd23;
                        a[_Kii41][_Kii51] = _Kdd22;
                        a[_Kii41][_Kii50] = _Kdd21;
                    }
                    /* 1 row x 1 column. */
                    for ( ; _Kii40<=_Kii58 + _Kii57 - 1; _Kii40++ ) {
                        _Kdd20 = a[_Kii41][_Kii40];
                        for ( _Kii42 = _Kii60; _Kii42<=_Kii48; _Kii42++ ) {
                            _Kdd20 +=  b[_Kii41][_Kii42] * c[_Kii42][_Kii40];
                        }
                        a[_Kii41][_Kii40] = _Kdd20;
                    }
                }
                /* Advance to the next column block; all later blocks
                   are 15 wide. */
                _Kii58 +=  _Kii57;
                _Kii57 = 15;
            }
            /* Advance to the next summation block; all later blocks
               are 15 wide. */
            _Kii60 +=  _Kii59;
            _Kii59 = 15;
        }
    }
    return a[3][5];
}


Previous Page Next Page Contents Index
Command-Line Qualifiers