One way KAP enhances performance on cache-memory machines is to
use a combination of loop blocking, loop interchanging, and loop
unrolling to optimize the cache hit ratios of operands in loops.
Memory management in KAP is enabled when /scalaropt=3
and /roundoff=3 have been specified. These are the
default settings for these qualifiers.
When arrays are processed in loops, KAP improves memory access
patterns by restructuring the code to work on small sections of
the arrays that fit in the cache, thereby giving high cache
hit ratios. The sizes of these sections are controlled by
the memory management command qualifiers /cacheline,
/cachesize, /fpregisters,
/dpregisters, and /setassociativity.
The default settings for these qualifiers are the characteristics of the standard machine for which the code is being targeted. These defaults give the best results for most programs. In some special cases, you may want to modify them to adjust the sizes of the array sections that are meant to reside in the cache. The algorithm that KAP uses keeps square blocks of data in the cache.
The following matrix multiplication example shows how loop interchanging, loop blocking, scalar temporary variables, and strip mining can be used. Since these techniques introduce some additional overhead, they are not used with small, simple loops.
! Untransformed input: for every (I,J), accumulate the products
! A2(I,K) * B2(K,J) over K into C2(I,J).
! D2 is declared here but not referenced in this fragment.
DIMENSION A2(N,N),B2(N,N),C2(N,N),D2(N,N)
DO I = 1, N
DO J = 1, N
! K is the innermost loop, so C2(I,J) is updated N times in a row.
DO K = 1, N
C2(I,J) = C2(I,J) + A2(I,K) * B2(K,J)
ENDDO
ENDDO
ENDDO
Becomes:
C     Transformed matrix multiply.  The J, K and I index ranges are
C     each strip-mined into blocks of at most 21 elements.  Inside a
C     block the J and I loops are unrolled by 3, and the C2 elements
C     being updated are held in scalar temporaries (RR1-RR16) for the
C     whole K loop, then stored back once.
      DIMENSION A2(N,N), B2(N,N), C2(N,N), D2(N,N)
      INTEGER II1, II2, II3, II4, II7, II8, II9, II10, II13, II14, II15
     X, II16, II19, II20, II21, II22, II23, II24, II25, II26, II27,
     X II28
      REAL RR1, RR2, RR3, RR4, RR5, RR6, RR7, RR8, RR9, RR10, RR11,
     X RR12, RR13, RR14, RR15, RR16, RR17, RR18, RR19
C     II3/II2, II9/II8 and II15/II14 hold the first index and the
C     length of the current J, K and I block.  The first block of
C     each range has length MOD(N-1,21)+1; later blocks have 21.
      II3 = 1
      II1 = MOD (N - 1, 21) + 1
      II2 = II1
      II7 = MOD (N - 1, 21) + 1
      II13 = MOD (N - 1, 21) + 1
C     II4, II10 and II16 only count blocks; they are never used as
C     subscripts.  Loop 12 steps over J blocks.
      DO 12 II4=1,N,21
        II9 = 1
        II8 = II7
C       II27 = last J that can start a full group of 3 columns,
C       II28 = last J of the block.
        II27 = II3 + II2 - 3
        II28 = II3 + II2 - 1
C       Loop 11 steps over K blocks.
        DO 11 II10=1,N,21
          II15 = 1
          II14 = II13
C         Upper K bound of the block, one copy per inner K loop.
          II19 = II9 + II8 - 1
          II20 = II9 + II8 - 1
          II24 = II9 + II8 - 1
          II23 = II9 + II8 - 1
C         Loop 10 steps over I blocks.
          DO 10 II16=1,N,21
            II21 = II15 + II14 - 3
            II22 = II15 + II14 -1
C           Groups of 3 columns (J..J+2).
            DO 5 J=II3,II27,3
C             Groups of 3 rows (I..I+2): a 3 x 3 tile of C2.
              DO 3 I=II15,II21,3
                RR1 = C2(I,J)
                RR2 = C2(I+1,J)
                RR3 = C2(I,J+1)
                RR4 = C2(I+1,J+1)
                RR5 = C2(I,J+2)
                RR6 = C2(I+1,J+2)
                RR7 = C2(I+2,J)
                RR8 = C2(I+2,J+1)
                RR9 = C2(I+2,J+2)
                DO 2 K=II9,II19,1
                  RR17 = A2(I,K) * B2(K,J)
                  RR1 = RR1 + RR17
                  RR17 = A2(I+1,K) * B2(K,J)
                  RR2 = RR2 + RR17
                  RR17 = A2(I,K) * B2(K,J+1)
                  RR3 = RR3 + RR17
                  RR17 = A2(I+1,K) * B2(K,J+1)
                  RR4 = RR4 + RR17
                  RR17 = A2(I,K) * B2(K,J+2)
                  RR5 = RR5 + RR17
                  RR17 = A2(I+1,K) * B2(K,J+2)
                  RR6 = RR6 + RR17
                  RR17 = A2(I+2,K) * B2(K,J)
                  RR7 = RR7 + RR17
                  RR17 = A2(I+2,K) * B2(K,J+1)
                  RR8 = RR8 + RR17
                  RR17 = A2(I+2,K) * B2(K,J+2)
                  RR9 = RR9 + RR17
    2           CONTINUE
                C2(I,J) = RR1
                C2(I+1,J) = RR2
                C2(I,J+1) = RR3
                C2(I+1,J+1) = RR4
                C2(I,J+2) = RR5
                C2(I+1,J+2) = RR6
                C2(I+2,J) = RR7
                C2(I+2,J+1) = RR8
                C2(I+2,J+2) = RR9
    3         CONTINUE
C             Remaining rows of the same 3 columns, one row at a
C             time, starting from the value I has after loop 3.
C             This loop and the J loop above both end on label 5.
              DO 5 I=I,II22,1
                RR10 = C2(I,J)
                RR11 = C2(I,J+1)
                RR12 = C2(I,J+2)
                DO 4 K=II9,II20,1
                  RR18 = A2(I,K) * B2(K,J)
                  RR10 = RR10 + RR18
                  RR18 = A2(I,K) * B2(K,J+1)
                  RR11 = RR11 + RR18
                  RR18 = A2(I,K) * B2(K,J+2)
                  RR12 = RR12 + RR18
    4           CONTINUE
                C2(I,J) = RR10
                C2(I,J+1) = RR11
                C2(I,J+2) = RR12
    5       CONTINUE
            II25 = II15 + II14 - 3
            II26 = II15 + II14 - 1
C           Remaining columns of the J block, one column at a time,
C           starting from the value J has after loop 5.
            DO 9 J=J,II28,1
C             Groups of 3 rows of a single column.
              DO 7 I=II15,II25,3
                RR13 = C2(I,J)
                RR14 = C2(I+1,J)
                RR15 = C2(I+2,J)
                DO 6 K=II9,II23,1
                  RR19 = A2(I,K) * B2(K,J)
                  RR13 = RR13 + RR19
                  RR19 = A2(I+1,K) * B2(K,J)
                  RR14 = RR14 + RR19
                  RR19 = A2(I+2,K) * B2(K,J)
                  RR15 = RR15 + RR19
    6           CONTINUE
                C2(I,J) = RR13
                C2(I+1,J) = RR14
                C2(I+2,J) = RR15
    7         CONTINUE
C             Remaining rows of that column; shares label 9 with the
C             J loop above.
              DO 9 I=I,II26,1
                RR16 = C2(I,J)
                DO 8 K=II9,II24,1
                  RR16 = RR16 + A2(I,K) * B2(K,J)
    8           CONTINUE
                C2(I,J) = RR16
    9       CONTINUE
C           Advance to the next I block; all later blocks are 21 long.
            II15 = II15 + II14
            II14 = 21
   10     CONTINUE
C         Advance to the next K block.
          II9 = II9 + II8
          II8 = 21
   11   CONTINUE
C       Advance to the next J block.
        II3 = II3 + II2
        II2 = 21
   12 CONTINUE