深入理解計算機系統 (CS:APP) - 高速緩存實驗 Cache Lab 解析

本文轉載自查看原文 2020-03-20 19:15 1811

原文地址：https://billc.io/2019/05/csapp-cachelab/

這個實驗是這學期的第四個實驗。作為緩存這一章的配套實驗，設計得非常精妙。難度上來講，相比之前的修改現成文件，直接寫一個程序也更高了一些。需要注意的是檢查程序在編譯時開啟了 -Werror，需要保證沒有警告才能成功編譯。

從官方文檔得知需要完善 csim.c 和 trans.c 文件，第一個是模擬一個高速緩存的程序並從由 valgrind 程序生成的 trace 文件中統計 hit, miss 和 eviction 的數量。第二個文件需要優化矩陣轉置程序降低程序的不命中度。

PART A

這一部分的核心是使用了一個結構體來模擬一個緩存行：

/* One simulated cache line: bookkeeping only, no data payload is stored. */
typedef struct {
    int valid;    /* non-zero once the line holds a block */
    ulong tag;    /* tag bits of the address cached in this line */
    clock_t time; /* recency stamp; the smallest stamp in a set is evicted */
} CacheLine;

再通過把緩存行在內存中動態分配成一個二維數組，實現模擬緩存的功能。並且使用了typedef CacheLine *CacheSet; 和 typedef CacheSet *CacheHead; 來讓程序更整齊。輸入來源於文件和命令行參數。可以用 getopt() 函數來解析參數。

各個函數的作用如下：

CacheHead CacheInit(int S, int E) 為緩存動態分配內存；
int CacheJudge(CacheHead cache, ulong index, ulong tag) 判斷緩存狀態，是否有效，標記匹配；
void CacheEvict(CacheHead cache, ulong index, ulong tag) 執行 eviction 操作；
void CacheTouch(CacheHead cache, ulong index, ulong tag) 執行讀取操作，只更新時間戳；
void CacheInsert(CacheHead cache, ulong index, ulong tag) 執行緩存寫入操作；
void Adder(int type, int num) 計數器，增加 hit, miss 和 eviction 的數量，並根據配置選擇打印信息；
void printByte(bytept h, int len) 逐字節以 16 進制打印內存數據；
void Execute(CacheHead cache, char type, ulong address, int len) 主要的執行函數；
int main(int argc, char *args[]) main 函數，讀取參數，打開文件；

完整的程序代碼如下：

// Written By @BillChen
// 2019.5.20
#include "cachelab.h"
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* Address width assumed by the tag/index arithmetic. */
#define MACHINE_BITS 64
/* Status codes returned by CacheJudge(). */
#define NEED_EVICT -1 /* no matching line and no invalid line in the set */
#define NO_MATCH -2   /* no matching line, but at least one invalid line */
#define CACHED 1      /* a valid line with the requested tag exists */
/* Selector values for the `type` argument of Adder(). */
#define ADD_HIT 1
#define ADD_MISS 2
#define ADD_EVICT 3

/* Running totals, updated by Adder() and reported through printSummary(). */
int totalMissCount = 0;
int totalHitCount = 0;
int totalEvictCount = 0;

typedef unsigned long ulong;
/* Byte pointer used by printByte() to dump an object's raw bytes. */
typedef unsigned char *bytept;
/* getopt() option string: s, E, b and t take an argument; h, V and v are flags. */
const char *optString = "s:E:b:t:hVv";

/* Program-wide configuration, filled in by main() from the command line. */
struct globalOptions {
    int setIndexBits;     /* -s: number of set index bits */
    int associativity;    /* -E: number of lines per set */
    int blockBits;        /* -b: number of block offset bits */
    int verboseFlag;      /* set by -v and by -V */
    int tagBits;          /* MACHINE_BITS - blockBits - setIndexBits, computed in main() */
    int superVerboseFlag; /* set by -V only; enables the per-byte dumps in Execute() */
    char *traceDir;       /* -t: path of the trace file passed to fopen() */
} globalOptions;
/*
 * Triple of hit/miss/eviction counts.
 * NOTE(review): nothing in the code shown here references this struct; the
 * totals are kept in the three global counters instead.
 */
struct result {
    int hit;
    int miss;
    int evict;
};
/* One simulated cache line: bookkeeping only, no data payload is stored. */
typedef struct {
    int valid;    /* non-zero once the line holds a block */
    ulong tag;    /* tag bits of the address cached in this line */
    clock_t time; /* recency stamp; the smallest stamp in a set is evicted */
} CacheLine;

/* A set is an array of lines; the cache is an array of sets. */
typedef CacheLine *CacheSet;
typedef CacheSet *CacheHead;

/* Write the command-line synopsis and the per-option help text to stdout. */
void usage() {
    static const char *const helpText[] = {
        "Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n",
        "-h get help info\n",
        "-v Optional verbose flag that displays trace info\n",
        "-V Optional super verbose flag that displays very detailed trace info\n",
        "-s <s> Number of set index bits\n",
        "-E <E> Associativity (number of lines per set)\n",
        "-b <b> Number of block bits\n",
        "-t <tracefile>: Name of the valgrind trace to replay\n",
    };
    const size_t lineCount = sizeof helpText / sizeof helpText[0];
    size_t n;

    for (n = 0; n < lineCount; n++) {
        fputs(helpText[n], stdout);
    }
}

CacheHead CacheInit(int S, int E) {
    CacheHead cache;
    cache = calloc(1 << S, sizeof(CacheSet));
    if (cache == NULL) {
        printf("Fail to allocate memory for cache.\n");
        exit(EXIT_FAILURE);
    }
    int i = 0;
    for (i = 0; i < 1 << S; i++) {
        if ((cache[i] = calloc(E, sizeof(CacheLine))) == NULL) {
            printf("Fail to allocate memory for cache.\n");
            exit(EXIT_FAILURE);
        }
    }
    for (i = 0; i < 1 << S; i++) {
        int j;
        for (j = 0; j < E; j++) {
            cache[i][j].valid = 0;
        }
    }
    return cache;
}

/*
 * Classify an access to set `index` for the given tag.
 *
 * Returns CACHED when a valid line carries `tag`, NO_MATCH when there is no
 * such line but at least one line is invalid, and NEED_EVICT when there is
 * no such line and no invalid line. The cache is not modified.
 */
int CacheJudge(CacheHead cache, ulong index, ulong tag) {
    const int ways = globalOptions.associativity;
    int hasFreeLine = 0;
    int way;

    for (way = 0; way < ways; way++) {
        const CacheLine *line = &cache[index][way];
        if (line->valid == 1 && line->tag == tag) {
            return CACHED;
        }
        if (line->valid == 0) {
            hasFreeLine = 1;
        }
    }
    return hasFreeLine ? NO_MATCH : NEED_EVICT;
}

/*
 * Return the next value of a strictly increasing logical clock.
 *
 * Recency stamps must differ between any two accesses for least-recently-used
 * selection to be exact. clock() measures processor time at a limited
 * resolution, so back-to-back accesses can receive equal stamps; a counter
 * cannot. The first value handed out is 1, so a stamp of 0 only ever appears
 * on lines that were never filled or were cleared by CacheEvict().
 */
static clock_t NextStamp(void) {
    static clock_t logicalTime = 0;
    return ++logicalTime;
}

/*
 * Store `tag` in the first invalid line of set `index` and stamp it as the
 * most recent access.
 *
 * The caller must guarantee that the set has an invalid line (CacheJudge()
 * returned NO_MATCH, or CacheEvict() has just run). If none exists the
 * program reports the broken invariant and exits instead of writing past the
 * end of the set.
 */
void CacheInsert(CacheHead cache, ulong index, ulong tag) {
    CacheSet set = cache[index];
    int way;

    for (way = 0; way < globalOptions.associativity; way++) {
        if (set[way].valid == 0) {
            set[way].tag = tag;
            set[way].valid = 1;
            set[way].time = NextStamp();
            return;
        }
    }
    fprintf(stderr, "CacheInsert: set %lu has no free line.\n", index);
    exit(EXIT_FAILURE);
}

/*
 * Invalidate the line of set `index` with the smallest recency stamp.
 *
 * `tag` is unused; the parameter is kept so all cache operations share one
 * signature. Does nothing when the set has no lines.
 */
void CacheEvict(CacheHead cache, ulong index, ulong tag) {
    CacheSet set = cache[index];
    int victim = 0;
    int way;

    (void)tag;
    if (globalOptions.associativity < 1) {
        return;
    }
    for (way = 1; way < globalOptions.associativity; way++) {
        if (set[way].time < set[victim].time) {
            victim = way;
        }
    }
    set[victim].tag = 0;
    set[victim].time = 0;
    set[victim].valid = 0;
}

/*
 * Refresh the recency stamp of the valid line in set `index` that holds
 * `tag`. Only valid lines are considered, and the scan stops at the end of
 * the set; if no line matches, the cache is left unchanged.
 */
void CacheTouch(CacheHead cache, ulong index, ulong tag) {
    CacheSet set = cache[index];
    int way;

    for (way = 0; way < globalOptions.associativity; way++) {
        if (set[way].valid == 1 && set[way].tag == tag) {
            set[way].time = NextStamp();
            return;
        }
    }
}

/*
 * Add `num` to the counter selected by `type` (ADD_HIT, ADD_MISS or
 * ADD_EVICT). When the verbose flag is set and `num` is non-zero, the
 * matching word is also printed. Any other `type` is ignored.
 */
void Adder(int type, int num) {
    int *counter;
    const char *word;

    if (type == ADD_EVICT) {
        counter = &totalEvictCount;
        word = "eviction ";
    } else if (type == ADD_HIT) {
        counter = &totalHitCount;
        word = "hit ";
    } else if (type == ADD_MISS) {
        counter = &totalMissCount;
        word = "miss ";
    } else {
        return;
    }

    *counter += num;
    if (globalOptions.verboseFlag && num != 0) {
        printf("%s", word);
    }
}

/*
 * Print `len` bytes starting at `h` as two-digit lowercase hex values, each
 * followed by a space, then end the line. Bytes appear in memory order.
 */
void printByte(bytept h, int len) {
    const unsigned char *cursor = h;
    int remaining;

    for (remaining = len; remaining > 0; remaining--) {
        printf("%.2x ", *cursor);
        cursor++;
    }
    putchar('\n');
}

/*
 * Simulate one data access and update the hit/miss/eviction counters.
 *
 * cache:   cache created by CacheInit().
 * type:    trace operation letter; 'M' (modify) counts one extra hit for its
 *          store half, every other letter is treated as a single access.
 * address: accessed address.
 * len:     access size from the trace; not used by the simulation.
 *
 * The set index is taken by shifting out the block offset and masking
 * setIndexBits bits. This stays well defined when setIndexBits is 0 (one
 * set); shifting a 64-bit value by 64 to obtain the same bits would not be.
 */
void Execute(CacheHead cache, char type, ulong address, int len) {
    const int s = globalOptions.setIndexBits;
    const int b = globalOptions.blockBits;
    /* Each shift count is checked so that it never reaches the word width. */
    ulong blockNumber = (b < MACHINE_BITS) ? address >> b : 0;
    ulong index = (s < MACHINE_BITS) ? (blockNumber & ((1UL << s) - 1)) : blockNumber;
    ulong tag = (s + b < MACHINE_BITS) ? address >> (s + b) : 0;
    int status = CacheJudge(cache, index, tag);

    (void)len;

    if (globalOptions.verboseFlag == 1) {
        if (globalOptions.superVerboseFlag == 1) {
            printf("\n[address:] ");
            printByte((bytept)&address, sizeof(long));
            printf("[index:] ");
            printByte((bytept)&index, sizeof(long));
            printf("[tag:] ");
            printByte((bytept)&tag, sizeof(long));
            /* %lu: index and tag are unsigned long. */
            printf("(Decimal)[index: %lu, tag: %lu]\n------------------------------------------- ", index, tag);
        } else {
            printf("(Decimal)[index: %lu, tag: %lu] ------ ", index, tag);
        }
    }

    switch (status) {
    case CACHED:
        CacheTouch(cache, index, tag);
        Adder(ADD_HIT, 1);
        break;
    case NO_MATCH:
        CacheInsert(cache, index, tag);
        Adder(ADD_MISS, 1);
        break;
    case NEED_EVICT:
        CacheEvict(cache, index, tag);
        CacheInsert(cache, index, tag);
        Adder(ADD_MISS, 1);
        Adder(ADD_EVICT, 1);
        break;
    default:
        printf("Unknown error.\n");
        exit(EXIT_FAILURE);
    }

    /* The store half of a modify finds the line the load half just used. */
    if (type == 'M') {
        Adder(ADD_HIT, 1);
    }

    if (globalOptions.verboseFlag == 1) {
        printf("\n");
    }
}

int main(int argc, char *args[]) {
    char ch;
    while ((ch = getopt(argc, args, optString)) != -1) {
        switch (ch) {
        case 's':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <s>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.setIndexBits = atoi(optarg);
            break;
        case 'E':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <E>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.associativity = atoi(optarg);
            break;
        case 'b':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <b>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.blockBits = atoi(optarg);
            break;
        case 't':
            globalOptions.traceDir = optarg;
            break;
        case 'v':
            globalOptions.verboseFlag = 1;
            break;
        case 'h':
            usage();
            exit(EXIT_FAILURE);
        case 'V':
            globalOptions.verboseFlag = 1;
            globalOptions.superVerboseFlag = 1;
            break;
        default:
            usage();
            exit(EXIT_FAILURE);
            break;
        }
    }
    globalOptions.tagBits = MACHINE_BITS - globalOptions.blockBits - globalOptions.setIndexBits;

    FILE *traceFile = fopen(globalOptions.traceDir, "r");
    if (traceFile == NULL) {
        printf("Fail to open file: %s\n", globalOptions.traceDir);
        exit(EXIT_FAILURE);
    }
    CacheHead cache = CacheInit(globalOptions.setIndexBits, globalOptions.associativity);
    char traceLine[32];
    while (fgets(traceLine, 32, traceFile) != NULL) {
        char mode;
        ulong address;
        int len;
        sscanf(traceLine, " %c %lx,%d", &mode, &address, &len);
        if (mode == 'I')
            continue;
        if (globalOptions.verboseFlag == 1) {
            printf("%c %lx,%d ", mode, address, len);
        }
        Execute(cache, mode, address, len);
    }
    printSummary(totalHitCount, totalMissCount, totalEvictCount);
    free(cache);
    return 0;
}

最終在 ./driver.py 的測試下，該程序和 csim-ref 的運行結果一致。

PART B

按照官方文檔的說明，需要在 trans.c 中寫入一個優化的矩陣轉置函數，盡可能地降低不命中率。使用命令 ./test-trans -M <col> -N <row> 可以查看這一轉置函數的不命中數（矩陣 A 為 N 行 M 列，即 A[N][M]）。生成的 trace.fi 文件還可以利用 PART A 寫的緩存模擬器檢查命中情況。

從官方文檔得知要在 PART B 中得到分數需要完成三個測試並滿足對應的不命中數條件。

Test I: 32 * 32

由於程序使用的緩存塊偏移位數 b 為 5，也就是塊大小為 2^5 = 32 字節。sizeof(int) = 4，所以每個緩存行可以存儲下 8 個整數。

先研究原始的一個簡單的矩陣轉置函數：

/* Baseline transpose: copy every A[i][j] to B[j][i], one element at a time. */
int i, j, tmp;
for (i = 0; i < N; i++) {         /* first index of A runs over N */
    for (j = 0; j < M; j++) {     /* second index of A runs over M */
        tmp = A[i][j];
        B[j][i] = tmp;            /* B is written with the indices swapped */
    }
}

這一函數的運行結果出現了 1000 多個 miss。提取一小部分原始的文件，利用 csim 查看詳細的 miss 和 eviction 信息，可以發現在讀取的時候發生了嚴重的抖動，導致了大量 miss 的出現。

所以可以利用矩陣分塊的思想。每一行數組都可以被存入 4 個緩存行中，一共有 32 個緩存行，所以每過 8 行就會出現一次和前面相同的組索引，發生 miss 和 eviction。所以考慮將 32 * 32 的矩陣分成 16 個 8 * 8 的矩陣，每一次都將一行的 8 個 int 分別存儲進 t1 – t8，再寫入 B 中對應的一列。

即，將矩陣划分成如下結構：

1	2	3	4
5	6	7	8
9	10	11	12
13	14	15	16

其中每一個小塊都是 8 * 8，每一行能夠完整存儲到緩存行中的矩陣。這種情況在 transpose_submit() 中的代碼如下：

/* 32 x 32 case: walk the matrix in 8 x 8 tiles; (i, j) is a tile's top-left corner in A. */
if(N == 32 && M == 32){
    int i, j, k;
    int t1, t2, t3, t4, t5, t6, t7, t8;
    for (i = 0; i < 32; i += 8) {
        for (j = 0; j < 32; j += 8) {
            /* k selects one row of the current tile of A. */
            for (k = 0; k < 8; k++) {
                /* Read all eight ints of the row into scalars before any write to B. */
                /* NOTE(review): presumably this keeps A's row from being evicted by */
                /* the B writes on diagonal tiles - confirm with the simulator. */
                t1 = A[i + k][j];
                t2 = A[i + k][j + 1];
                t3 = A[i + k][j + 2];
                t4 = A[i + k][j + 3];
                t5 = A[i + k][j + 4];
                t6 = A[i + k][j + 5];
                t7 = A[i + k][j + 6];
                t8 = A[i + k][j + 7];
                /* Scatter the row into column i + k of B's tile. */
                B[j][i + k] = t1;
                B[j + 1][i + k] = t2;
                B[j + 2][i + k] = t3;
                B[j + 3][i + k] = t4;
                B[j + 4][i + k] = t5;
                B[j + 5][i + k] = t6;
                B[j + 6][i + k] = t7;
                B[j + 7][i + k] = t8;
            }
        }
    }
}

結果如下圖所示：

Test II: 64 * 64

和第一種情況測試類似。但是由於大小變成了 64 * 64，每過 4 行就會出現一次沖突的情況。所以可以先分成 8 * 8 的塊，然后再把 8 * 8 的塊分成 4 個 4 * 4 的塊。讀取一行，但存儲進的位置如圖所示。逆序存儲之后再逐行處理 C’ 和 B’ 處的數據。

由於之前是逆序存儲的，所以在 C’ 會把 0 加載進緩存，而在 B’ 會把 24 加載進緩存，再利用 t0, t1, t2, t3 四個變量作臨時變量存儲，交換 0 行和 24 行的位置。

這一部分比較復雜，這里參考了歐陽松的博客（https://www.ouyangsong.com/posts/55291/#fn4），大概的邏輯如下圖所示：

具體的代碼實現如下：

/*
 * 64 x 64 case: 8 x 8 tiles, each handled as four 4 x 4 quadrants in two
 * phases. (i, j) is the tile's top-left corner in A.
 */
else if (N == 64 && M == 64) {
    int t0, t1, t2, t3, t4, t5, t6, t7;
    for (int i = 0; i < N; i += 8) {
        for (int j = 0; j < M; j += 8) {
            /* Phase 1: the top four rows of A's tile, one full row at a time. */
            for (int k = i; k < i + 4; k++) {
                t0 = A[k][j];
                t1 = A[k][j + 1];
                t2 = A[k][j + 2];
                t3 = A[k][j + 3];
                t4 = A[k][j + 4];
                t5 = A[k][j + 5];
                t6 = A[k][j + 6];
                t7 = A[k][j + 7];
                /* Left half of the row goes to its final place in B's top-left quadrant. */
                B[j][k] = t0;
                B[j + 1][k] = t1;
                B[j + 2][k] = t2;
                B[j + 3][k] = t3;
                /* Right half is parked in B's top-right quadrant with the row order */
                /* reversed: B[j + r][k + 4] holds A[k][j + 7 - r]. */
                B[j + 0][k + 4] = t7;
                B[j + 1][k + 4] = t6;
                B[j + 2][k + 4] = t5;
                B[j + 3][k + 4] = t4;
            }
            /* Phase 2: each h pairs B row j + 3 - h with B row j + 4 + h. */
            for (int h = 0; h < 4; h++) {
                /* Column j + 3 - h of A's bottom-left quadrant. */
                t0 = A[i + 4][j + 3 - h];
                t1 = A[i + 5][j + 3 - h];
                t2 = A[i + 6][j + 3 - h];
                t3 = A[i + 7][j + 3 - h];
                /* Column j + 4 + h of A's bottom-right quadrant. */
                t4 = A[i + 4][j + 4 + h];
                t5 = A[i + 5][j + 4 + h];
                t6 = A[i + 6][j + 4 + h];
                t7 = A[i + 7][j + 4 + h];
                /* Move the parked values (A[i..i+3][j + 4 + h]) to their final row */
                /* in B's bottom-left quadrant. */
                B[j + 4 + h][i + 0] = B[j + 3 - h][i + 4];
                B[j + 4 + h][i + 1] = B[j + 3 - h][i + 5];
                B[j + 4 + h][i + 2] = B[j + 3 - h][i + 6];
                B[j + 4 + h][i + 3] = B[j + 3 - h][i + 7];
                /* Overwrite the freed slots with the final top-right values. */
                B[j + 3 - h][i + 4] = t0;
                B[j + 3 - h][i + 5] = t1;
                B[j + 3 - h][i + 6] = t2;
                B[j + 3 - h][i + 7] = t3;
                /* Complete the same B row with the bottom-right values. */
                B[j + 4 + h][i + 4] = t4;
                B[j + 4 + h][i + 5] = t5;
                B[j + 4 + h][i + 6] = t6;
                B[j + 4 + h][i + 7] = t7;
            }
        }
    }
}

得到如下結果：

Test III: 61 * 67

這一測試中由於矩陣不規則，而且也不是 8 的倍數，所以在行與行之間沒有特別明顯的沖突不命中的關系。可以嘗試用分塊矩陣的方式優化。經過嘗試 8 * 8 的分塊和 16 * 16 的分塊后，發現使用 16 * 16 的分塊方式可以將 miss 數降低到 2000 以下。

這一部分的代碼如下：

/* Remaining sizes: plain element-wise transpose performed in 16 x 16 tiles. */
else {
    int i, j, k, h;
    for (i = 0; i < N; i += 16) {
        for (j = 0; j < M; j += 16) {
            /* The extra k < N and h < M tests clip tiles that overhang the matrix edge. */
            for (k = i; k < i + 16 && k < N; k++) {
                for (h = j; h < j + 16 && h < M; h++) {
                    B[h][k] = A[k][h];
                }
            }
        }
    }
}

可以得到 1992 的 miss 數。

最終在 ./driver.py 的運行結果中，Part B 獲得如下結果：

本實驗的完整代碼可以在這里下載：https://github.com/BillChen2000/LearningRepo/blob/master/Course/CSAPP/LAB4/billchen-handin.tar

一如既往地，現在又是凌晨了 orz.

2019.5.22

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。