#include <stdio.h>
#include <unistd.h>

/* Side length of the square matrix touched by the demo. */
enum { DIM = 1000 };

/*
 * Cache-locality demo: zero a DIM x DIM int matrix in one of two orders.
 *
 *   no arguments  (argc == 1): row-major traversal, a[i][j] -- consecutive
 *                              stores land on adjacent addresses.
 *   any arguments (argc != 1): column-major traversal, a[j][i] -- consecutive
 *                              stores are a full row (DIM ints) apart.
 *
 * Only the argument count is examined; argument values are ignored.
 * Always returns 0.
 */
int main(int argc, char **argv)
{
    /*
     * Static storage rather than an automatic array: DIM * DIM ints is
     * roughly 4 MB with a 4-byte int, which can exceed the stack limit on
     * platforms with small default stacks.
     */
    static int a[DIM][DIM];

    (void)argv; /* only argc selects the traversal order */

    /*
     * NOTE: the matrix is never read back, so an optimizing build is free to
     * drop these stores. Build without optimization (e.g. plain `cc miss.c`)
     * to observe the difference between the two branches.
     */
    if (1 == argc) {
        /* Row-major: inner loop walks along one row. */
        for (int i = 0; i < DIM; ++i) {
            for (int j = 0; j < DIM; ++j) {
                a[i][j] = 0;
            }
        }
    } else {
        /* Column-major: inner loop jumps from row to row. */
        for (int i = 0; i < DIM; ++i) {
            for (int j = 0; j < DIM; ++j) {
                a[j][i] = 0;
            }
        }
    }

    return 0;
}
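上面的程序有兩個分支: 不帶參數運行時按行遍歷數組 (a[i][j]), 帶任意參數運行時按列遍歷數組 (a[j][i])。哪個分支效率高? 顯然, 第一個分支效率高, 為什麼呢? 因為在C/C++中, 數組是按行存儲的, 同一行的相鄰元素在內存中是連續的; 按行訪問時, 一條緩存行 (cache line) 載入後可以連續命中其中的多個元素, 充分利用了程序的局部性原理(空間局部性); 而按列訪問時, 相鄰兩次訪問之間隔了一整行 (1000 個 int), 幾乎每次訪問都會落在不同的緩存行上, 造成大量緩存未命中。先用time命令來看看結果: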
[root@bogon c++]# g++ miss.c -o miss [root@bogon c++]# ./miss [root@bogon c++]# time ./miss real 0m0.009s user 0m0.009s sys 0m0.000s [root@bogon c++]# time ./miss 1 real 0m0.013s user 0m0.013s sys 0m0.000s [root@bogon c++]# time ./miss real 0m0.010s user 0m0.010s sys 0m0.000s [root@bogon c++]# time ./miss 1 real 0m0.013s user 0m0.013s sys 0m0.000s
[root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss Performance counter stats for './miss': 88,780 L1-dcache-load-misses 0.009002291 seconds time elapsed 0.009174000 seconds user 0.000000000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 1 Performance counter stats for './miss 1': 1,015,683 L1-dcache-load-misses 0.012000335 seconds time elapsed 0.006059000 seconds user 0.006059000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 1 Performance counter stats for './miss 1': 1,015,363 L1-dcache-load-misses 0.012145156 seconds time elapsed 0.006134000 seconds user 0.006134000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 0 Performance counter stats for './miss 0': 1,011,740 L1-dcache-load-misses 0.012363858 seconds time elapsed 0.012484000 seconds user 0.000000000 seconds sys [root@bogon c++]# perf stat -e L1-dcache-load-misses ./miss 0 Performance counter stats for './miss 0': 1,015,347 L1-dcache-load-misses 0.012348778 seconds time elapsed 0.006237000 seconds user 0.006237000 seconds sys