diff --git a/scenario4-cache-misses/Makefile b/scenario4-cache-misses/Makefile index 37dd626..3f71fb4 100644 --- a/scenario4-cache-misses/Makefile +++ b/scenario4-cache-misses/Makefile @@ -1,15 +1,14 @@ CC = gcc CFLAGS = -O2 -Wall -all: cache_demo list_vs_array +TARGETS = matrix_col_major matrix_row_major list_scattered list_sequential array_sum -cache_demo: cache_demo.c - $(CC) $(CFLAGS) -o $@ $< +all: $(TARGETS) -list_vs_array: list_vs_array.c +%: %.c $(CC) $(CFLAGS) -o $@ $< clean: - rm -f cache_demo list_vs_array + rm -f $(TARGETS) .PHONY: all clean diff --git a/scenario4-cache-misses/README.md b/scenario4-cache-misses/README.md index d5393cb..e5b1fbe 100644 --- a/scenario4-cache-misses/README.md +++ b/scenario4-cache-misses/README.md @@ -26,25 +26,49 @@ Key concepts: - **Temporal locality**: Recently accessed data is likely to be accessed again ## Files -- `cache_demo.c` - Row-major vs column-major 2D array traversal -- `list_vs_array.c` - Array vs linked list traversal +- `matrix_col_major.c` - BAD: Column-major traversal (cache-hostile) +- `matrix_row_major.c` - GOOD: Row-major traversal (cache-friendly) +- `list_scattered.c` - BAD: Scattered linked list (worst cache behavior) +- `list_sequential.c` - MEDIUM: Sequential linked list (better, but still has overhead) +- `array_sum.c` - GOOD: Contiguous array (best cache behavior) -## Exercise 1: Row vs Column Major +## Setup -### Step 1: Build and run ```bash -make cache_demo -./cache_demo +make all ``` -You should see column-major is significantly slower (often 3-10x). +--- -### Step 2: Measure cache misses +## Exercise 1: Row-Major vs Column-Major Matrix Traversal + +### Step 1: Run the BAD version (column-major) ```bash -perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo +./matrix_col_major ``` -Compare the cache miss counts and ratios. +Note the execution time. 
+ +### Step 2: Profile to identify the issue +```bash +perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major +``` + +Observe the high cache miss rate and count. + +### Step 3: Run the GOOD version (row-major) +```bash +./matrix_row_major +``` + +This should be significantly faster (often 3-10x). + +### Step 4: Profile to confirm the improvement +```bash +perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major +``` + +Compare the cache miss counts and ratios with the column-major version. ### Why does this happen? @@ -67,20 +91,51 @@ Cache: [█_______________] ← load entire line, use 1 int, evict [█_______________] ← repeat for each access ``` -## Exercise 2: Array vs Linked List +--- -### Step 1: Build and run +## Exercise 2: Data Structure Memory Layout + +### Step 1: Run the WORST case (scattered linked list) ```bash -make list_vs_array -./list_vs_array +./list_scattered ``` -### Step 2: Measure cache behavior +Note the execution time - this is the worst case. + +### Step 2: Profile the cache behavior ```bash -perf stat -e cache-misses,cache-references ./list_vs_array +perf stat -e cache-misses,cache-references ./list_scattered ``` -### Three cases compared: +Observe the terrible cache miss rate due to random memory access. + +### Step 3: First improvement - sequential allocation +```bash +./list_sequential +``` + +This should be faster than scattered, as nodes are contiguous in memory. + +### Step 4: Profile the improvement +```bash +perf stat -e cache-misses,cache-references ./list_sequential +``` + +Cache behavior improves, but still not optimal due to pointer chasing. + +### Step 5: Best solution - contiguous array +```bash +./array_sum +``` + +This should be the fastest by a significant margin. 
+ +### Step 6: Profile the optimal case +```bash +perf stat -e cache-misses,cache-references ./array_sum +``` + +Compare all three cache miss counts: | Case | Memory Layout | Cache Behavior | |------|---------------|----------------| @@ -88,17 +143,22 @@ perf stat -e cache-misses,cache-references ./list_vs_array | List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent | | List (scattered) | Random | Terrible - every access misses | -### Why "sequential list" is still slower than array: +### Why linked lists are slow -1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address) +Even with sequential allocation, linked lists are slower than arrays: + +1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address until current node is loaded) 2. **Larger elements**: `struct node` is bigger than `int` (includes pointer) 3. **Indirect access**: Extra memory load for the `next` pointer +--- + ## Exercise 3: Deeper perf Analysis ### See more cache events ```bash -perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo +perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_col_major +perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_row_major ``` Events explained: @@ -110,12 +170,14 @@ Events explained: ### Profile with perf record ```bash -perf record -e cache-misses ./cache_demo +perf record -e cache-misses ./matrix_col_major perf report ``` This shows which functions cause the most cache misses. +--- + ## Discussion Questions 1. 
**Why doesn't the compiler fix this?** diff --git a/scenario4-cache-misses/array_sum.c b/scenario4-cache-misses/array_sum.c new file mode 100644 index 0000000..51db4dd --- /dev/null +++ b/scenario4-cache-misses/array_sum.c @@ -0,0 +1,65 @@ +/* + * GOOD: Contiguous Array Traversal + * ================================= + * This program uses a contiguous array for excellent cache locality. + * The CPU prefetcher can predict sequential access patterns. + * + * Compile: make array_sum + * Run: ./array_sum + * Profile: perf stat -e cache-misses,cache-references ./array_sum + */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define N 10000000 /* 10 million elements */ + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +long sum_array(int *arr, int n) { + long sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum; +} + +int *create_array(int n) { + int *arr = malloc(n * sizeof(int)); + if (!arr) { + perror("malloc array"); + exit(1); + } + for (int i = 0; i < n; i++) { + arr[i] = i % 100; + } + return arr; +} + +int main(void) { + printf("Contiguous Array Traversal (%d elements)\n", N); + printf("Sequential memory access - CPU prefetcher works perfectly.\n\n"); + + printf("Creating array...\n"); + int *arr = create_array(N); + + /* Warm up */ + sum_array(arr, N); + + double start = get_time(); + long result = sum_array(arr, N); + double elapsed = get_time() - start; + + printf("Array sum: %ld in %.4f seconds\n\n", result, elapsed); + + printf("To see cache behavior, run:\n"); + printf(" perf stat -e cache-misses,cache-references ./array_sum\n"); + + free(arr); + return 0; +} diff --git a/scenario4-cache-misses/cache_demo.c b/scenario4-cache-misses/cache_demo.c deleted file mode 100644 index f1887c1..0000000 --- a/scenario4-cache-misses/cache_demo.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Scenario 4: Cache Misses - Memory Access Patterns - * 
================================================== - * This program demonstrates the performance impact of memory access patterns. - * Row-major vs column-major traversal of a 2D array. - * - * Compile: gcc -O2 -o cache_demo cache_demo.c - * - * EXERCISES: - * 1. Run: ./cache_demo - * 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo - * 3. Why is one so much faster? - */ - -#include -#include -#include -#include - -#define ROWS 8192 -#define COLS 8192 - -/* - * Global array to ensure it's not optimized away. - * This is a 64MB array (8192 * 8192 * sizeof(int) = 256MB if int is 4 bytes) - * Wait, that's too big. Let's use smaller dimensions or chars. - */ - -/* Using static to avoid stack overflow */ -static int matrix[ROWS][COLS]; - -double get_time(void) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts.tv_sec + ts.tv_nsec / 1e9; -} - -long sum_row_major(void) { - /* - * Row-major traversal: access sequential memory addresses - * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ... - * This matches how C stores 2D arrays - CACHE FRIENDLY - */ - long sum = 0; - for (int i = 0; i < ROWS; i++) { - for (int j = 0; j < COLS; j++) { - sum += matrix[i][j]; - } - } - return sum; -} - -long sum_col_major(void) { - /* - * Column-major traversal: jump around in memory - * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ... 
- * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE - */ - long sum = 0; - for (int j = 0; j < COLS; j++) { - for (int i = 0; i < ROWS; i++) { - sum += matrix[i][j]; - } - } - return sum; -} - -void init_matrix(void) { - /* Initialize with some values */ - for (int i = 0; i < ROWS; i++) { - for (int j = 0; j < COLS; j++) { - matrix[i][j] = (i + j) % 100; - } - } -} - -int main(void) { - printf("Matrix size: %d x %d = %zu bytes\n", - ROWS, COLS, sizeof(matrix)); - printf("Cache line size (typical): 64 bytes\n"); - printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int)); - - init_matrix(); - - double start, elapsed; - long result; - - /* Warm up */ - result = sum_row_major(); - result = sum_col_major(); - - /* Row-major benchmark */ - start = get_time(); - result = sum_row_major(); - elapsed = get_time() - start; - printf("Row-major sum: %ld in %.3f seconds\n", result, elapsed); - - /* Column-major benchmark */ - start = get_time(); - result = sum_col_major(); - elapsed = get_time() - start; - printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed); - - printf("\n"); - printf("To see cache misses, run:\n"); - printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n"); - - return 0; -} diff --git a/scenario4-cache-misses/list_scattered.c b/scenario4-cache-misses/list_scattered.c new file mode 100644 index 0000000..d8a3b68 --- /dev/null +++ b/scenario4-cache-misses/list_scattered.c @@ -0,0 +1,123 @@ +/* + * BAD: Scattered Linked List Traversal + * ===================================== + * This program creates a linked list with nodes scattered randomly in memory, + * simulating real-world fragmented allocation patterns. + * This causes terrible cache behavior due to random memory access. 
+ * + * Compile: make list_scattered + * Run: ./list_scattered + * Profile: perf stat -e cache-misses,cache-references ./list_scattered + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <time.h> + +#define N 10000000 /* 10 million elements */ + +struct node { + int value; + struct node *next; +}; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +/* Fast deterministic PRNG - much faster than rand() */ +static uint64_t xorshift64_state = 42; + +static inline uint64_t xorshift64(void) { + uint64_t x = xorshift64_state; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + xorshift64_state = x; + return x; +} + +long sum_list(struct node *head) { + long sum = 0; + struct node *curr = head; + while (curr != NULL) { + sum += curr->value; + curr = curr->next; + } + return sum; +} + +/* + * Create linked list with nodes scattered in memory (worst case for cache) + * Each node is allocated individually, then shuffled and linked randomly. 
+ */ +struct node *create_list_scattered(int n) { + struct node **nodes = malloc(n * sizeof(struct node *)); + if (!nodes) { + perror("malloc"); + exit(1); + } + + /* Allocate each node separately - they end up scattered in heap */ + for (int i = 0; i < n; i++) { + nodes[i] = malloc(sizeof(struct node)); + if (!nodes[i]) { + perror("malloc node"); + exit(1); + } + nodes[i]->value = i % 100; + } + + /* Shuffle the order (Fisher-Yates) to ensure random access pattern */ + for (int i = n - 1; i > 0; i--) { + int j = xorshift64() % (i + 1); + struct node *tmp = nodes[i]; + nodes[i] = nodes[j]; + nodes[j] = tmp; + } + + /* Link them in shuffled order */ + for (int i = 0; i < n - 1; i++) { + nodes[i]->next = nodes[i + 1]; + } + nodes[n - 1]->next = NULL; + + struct node *head = nodes[0]; + free(nodes); /* Free the pointer array, not the nodes */ + return head; +} + +void free_scattered_list(struct node *head) { + while (head != NULL) { + struct node *next = head->next; + free(head); + head = next; + } +} + +int main(void) { + printf("Scattered Linked List Traversal (%d elements)\n", N); + printf("Each node allocated individually, then linked in random order.\n"); + printf("This causes maximum cache thrashing.\n\n"); + + printf("Creating scattered linked list (this takes a while)...\n"); + struct node *list = create_list_scattered(N); + + /* Warm up */ + sum_list(list); + + double start = get_time(); + long result = sum_list(list); + double elapsed = get_time() - start; + + printf("Scattered list sum: %ld in %.4f seconds\n\n", result, elapsed); + + printf("To see cache behavior, run:\n"); + printf(" perf stat -e cache-misses,cache-references ./list_scattered\n"); + + free_scattered_list(list); + return 0; +} diff --git a/scenario4-cache-misses/list_sequential.c b/scenario4-cache-misses/list_sequential.c new file mode 100644 index 0000000..c11c834 --- /dev/null +++ b/scenario4-cache-misses/list_sequential.c @@ -0,0 +1,83 @@ +/* + * MEDIUM: Sequential Linked List Traversal 
+ * ========================================= + * This program creates a linked list with nodes allocated contiguously, + * representing the best-case scenario for linked lists. + * Still slower than arrays due to pointer chasing overhead. + * + * Compile: make list_sequential + * Run: ./list_sequential + * Profile: perf stat -e cache-misses,cache-references ./list_sequential + */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define N 10000000 /* 10 million elements */ + +struct node { + int value; + struct node *next; +}; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +long sum_list(struct node *head) { + long sum = 0; + struct node *curr = head; + while (curr != NULL) { + sum += curr->value; + curr = curr->next; + } + return sum; +} + +/* + * Create linked list with nodes allocated sequentially (best case for list) + * All nodes allocated in one contiguous block, linked in order. + */ +struct node *create_list_sequential(int n) { + struct node *nodes = malloc(n * sizeof(struct node)); + if (!nodes) { + perror("malloc list"); + exit(1); + } + + for (int i = 0; i < n - 1; i++) { + nodes[i].value = i % 100; + nodes[i].next = &nodes[i + 1]; + } + nodes[n - 1].value = (n - 1) % 100; + nodes[n - 1].next = NULL; + + return nodes; +} + +int main(void) { + printf("Sequential Linked List Traversal (%d elements)\n", N); + printf("All nodes allocated contiguously - best case for linked list.\n"); + printf("Still has pointer chasing overhead vs array.\n\n"); + + printf("Creating sequential linked list...\n"); + struct node *list = create_list_sequential(N); + + /* Warm up */ + sum_list(list); + + double start = get_time(); + long result = sum_list(list); + double elapsed = get_time() - start; + + printf("Sequential list sum: %ld in %.4f seconds\n\n", result, elapsed); + + printf("To see cache behavior, run:\n"); + printf(" perf stat -e cache-misses,cache-references ./list_sequential\n"); + + 
free(list); + return 0; +} diff --git a/scenario4-cache-misses/list_vs_array.c b/scenario4-cache-misses/list_vs_array.c deleted file mode 100644 index 4c370e7..0000000 --- a/scenario4-cache-misses/list_vs_array.c +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Scenario 4b: Array vs Linked List Traversal - * ============================================ - * Arrays have excellent cache locality; linked lists do not. - * This demonstrates why "O(n) vs O(n)" can have very different constants. - * - * Compile: gcc -O2 -o list_vs_array list_vs_array.c - */ - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> - -#define N 10000000 /* 10 million elements */ - -struct node { - int value; - struct node *next; -}; - -double get_time(void) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts.tv_sec + ts.tv_nsec / 1e9; -} - -/* Sum array elements */ -long sum_array(int *arr, int n) { - long sum = 0; - for (int i = 0; i < n; i++) { - sum += arr[i]; - } - return sum; -} - -/* Sum linked list elements */ -long sum_list(struct node *head) { - long sum = 0; - struct node *curr = head; - while (curr != NULL) { - sum += curr->value; - curr = curr->next; - } - return sum; -} - -/* Create array */ -int *create_array(int n) { - int *arr = malloc(n * sizeof(int)); - if (!arr) { - perror("malloc array"); - exit(1); - } - for (int i = 0; i < n; i++) { - arr[i] = i % 100; - } - return arr; -} - -/* Create linked list - nodes allocated sequentially (best case for list) */ -struct node *create_list_sequential(int n) { - struct node *nodes = malloc(n * sizeof(struct node)); - if (!nodes) { - perror("malloc list"); - exit(1); - } - - for (int i = 0; i < n - 1; i++) { - nodes[i].value = i % 100; - nodes[i].next = &nodes[i + 1]; - } - nodes[n - 1].value = (n - 1) % 100; - nodes[n - 1].next = NULL; - - return nodes; -} - -/* Create linked list - nodes allocated randomly (worst case for cache) */ -struct node *create_list_scattered(int n) { - /* Allocate nodes individually to scatter them in memory */ - 
struct node **nodes = malloc(n * sizeof(struct node *)); - if (!nodes) { - perror("malloc"); - exit(1); - } - - /* Allocate each node separately */ - for (int i = 0; i < n; i++) { - nodes[i] = malloc(sizeof(struct node)); - if (!nodes[i]) { - perror("malloc node"); - exit(1); - } - nodes[i]->value = i % 100; - } - - /* Shuffle the order (Fisher-Yates) */ - srand(42); - for (int i = n - 1; i > 0; i--) { - int j = rand() % (i + 1); - struct node *tmp = nodes[i]; - nodes[i] = nodes[j]; - nodes[j] = tmp; - } - - /* Link them in shuffled order */ - for (int i = 0; i < n - 1; i++) { - nodes[i]->next = nodes[i + 1]; - } - nodes[n - 1]->next = NULL; - - struct node *head = nodes[0]; - free(nodes); /* Free the pointer array, not the nodes */ - return head; -} - -void free_scattered_list(struct node *head) { - while (head != NULL) { - struct node *next = head->next; - free(head); - head = next; - } -} - -int main(void) { - printf("Comparing array vs linked list traversal (%d elements)\n\n", N); - - double start, elapsed; - long result; - - /* Array */ - printf("Creating array...\n"); - int *arr = create_array(N); - - start = get_time(); - result = sum_array(arr, N); - elapsed = get_time() - start; - printf("Array sum: %ld in %.4f seconds\n", result, elapsed); - double array_time = elapsed; - free(arr); - - /* Sequential linked list (best case for list) */ - printf("\nCreating sequential linked list...\n"); - struct node *list_seq = create_list_sequential(N); - - start = get_time(); - result = sum_list(list_seq); - elapsed = get_time() - start; - printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed); - double list_seq_time = elapsed; - free(list_seq); - - /* Scattered linked list (worst case for cache) */ - printf("\nCreating scattered linked list (this takes a while)...\n"); - struct node *list_scat = create_list_scattered(N); - - start = get_time(); - result = sum_list(list_scat); - elapsed = get_time() - start; - printf("List sum (scattered): %ld in %.4f 
seconds\n", result, elapsed); - double list_scat_time = elapsed; - free_scattered_list(list_scat); - - printf("\n--- Summary ---\n"); - printf("Array: %.4fs (baseline)\n", array_time); - printf("List (sequential): %.4fs (%.1fx slower)\n", - list_seq_time, list_seq_time / array_time); - printf("List (scattered): %.4fs (%.1fx slower)\n", - list_scat_time, list_scat_time / array_time); - - printf("\nTo see cache behavior:\n"); - printf(" perf stat -e cache-misses,cache-references ./list_vs_array\n"); - - return 0; -} diff --git a/scenario4-cache-misses/matrix_col_major.c b/scenario4-cache-misses/matrix_col_major.c new file mode 100644 index 0000000..384de9e --- /dev/null +++ b/scenario4-cache-misses/matrix_col_major.c @@ -0,0 +1,73 @@ +/* + * BAD: Column-Major Matrix Traversal + * =================================== + * This program traverses a 2D matrix in column-major order, + * which causes poor cache utilization because C stores arrays in row-major order. + * + * Compile: make matrix_col_major + * Run: ./matrix_col_major + * Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major + */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ROWS 8192 +#define COLS 8192 + +/* Using static to avoid stack overflow */ +static int matrix[ROWS][COLS]; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +void init_matrix(void) { + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + matrix[i][j] = (i + j) % 100; + } + } +} + +/* + * Column-major traversal: jump around in memory + * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ... 
+ * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE + */ +long sum_col_major(void) { + long sum = 0; + for (int j = 0; j < COLS; j++) { + for (int i = 0; i < ROWS; i++) { + sum += matrix[i][j]; + } + } + return sum; +} + +int main(void) { + printf("Matrix size: %d x %d = %zu bytes\n", + ROWS, COLS, sizeof(matrix)); + printf("Cache line size (typical): 64 bytes\n"); + printf("Stride per access: %zu bytes (jumps over entire row!)\n\n", + COLS * sizeof(int)); + + init_matrix(); + + /* Warm up */ + sum_col_major(); + + double start = get_time(); + long result = sum_col_major(); + double elapsed = get_time() - start; + + printf("Column-major sum: %ld in %.3f seconds\n\n", result, elapsed); + + printf("To see cache misses, run:\n"); + printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major\n"); + + return 0; +} diff --git a/scenario4-cache-misses/matrix_row_major.c b/scenario4-cache-misses/matrix_row_major.c new file mode 100644 index 0000000..67568c2 --- /dev/null +++ b/scenario4-cache-misses/matrix_row_major.c @@ -0,0 +1,73 @@ +/* + * GOOD: Row-Major Matrix Traversal + * ================================= + * This program traverses a 2D matrix in row-major order, + * matching how C stores 2D arrays in memory for excellent cache utilization. 
+ * + * Compile: make matrix_row_major + * Run: ./matrix_row_major + * Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major + */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#define ROWS 8192 +#define COLS 8192 + +/* Using static to avoid stack overflow */ +static int matrix[ROWS][COLS]; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +void init_matrix(void) { + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + matrix[i][j] = (i + j) % 100; + } + } +} + +/* + * Row-major traversal: access sequential memory addresses + * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ... + * This matches how C stores 2D arrays - CACHE FRIENDLY + */ +long sum_row_major(void) { + long sum = 0; + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + sum += matrix[i][j]; + } + } + return sum; +} + +int main(void) { + printf("Matrix size: %d x %d = %zu bytes\n", + ROWS, COLS, sizeof(matrix)); + printf("Cache line size (typical): 64 bytes\n"); + printf("Stride per access: %zu bytes (sequential!)\n\n", + sizeof(int)); + + init_matrix(); + + /* Warm up */ + sum_row_major(); + + double start = get_time(); + long result = sum_row_major(); + double elapsed = get_time() - start; + + printf("Row-major sum: %ld in %.3f seconds\n\n", result, elapsed); + + printf("To see cache behavior, run:\n"); + printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major\n"); + + return 0; +}