scenario 4: break down into multiple files

2026-01-11 06:39:27 +05:30
parent 25f47e017d
commit 51ab2ed553
9 changed files with 504 additions and 310 deletions
--- a/scenario4-cache-misses/Makefile
+++ b/scenario4-cache-misses/Makefile
@@ -1,15 +1,14 @@
 CC = gcc
 CFLAGS = -O2 -Wall

-all: cache_demo list_vs_array
+TARGETS = matrix_col_major matrix_row_major list_scattered list_sequential array_sum

-cache_demo: cache_demo.c
-	$(CC) $(CFLAGS) -o $@ $<
+all: $(TARGETS)

-list_vs_array: list_vs_array.c
+%: %.c
 	$(CC) $(CFLAGS) -o $@ $<

 clean:
-	rm -f cache_demo list_vs_array
+	rm -f $(TARGETS)

 .PHONY: all clean
--- a/scenario4-cache-misses/README.md
+++ b/scenario4-cache-misses/README.md
@@ -26,25 +26,49 @@ Key concepts:
 - **Temporal locality**: Recently accessed data is likely to be accessed again

 ## Files
- `cache_demo.c` - Row-major vs column-major 2D array traversal
- `list_vs_array.c` - Array vs linked list traversal
+- `matrix_col_major.c` - BAD: Column-major traversal (cache-hostile)
+- `matrix_row_major.c` - GOOD: Row-major traversal (cache-friendly)
+- `list_scattered.c` - BAD: Scattered linked list (worst cache behavior)
+- `list_sequential.c` - MEDIUM: Sequential linked list (better, but still has overhead)
+- `array_sum.c` - GOOD: Contiguous array (best cache behavior)

-## Exercise 1: Row vs Column Major
+## Setup

-### Step 1: Build and run
 ```bash
-make cache_demo
-./cache_demo
+make all
 ```

-You should see column-major is significantly slower (often 3-10x).
+---

-### Step 2: Measure cache misses
+## Exercise 1: Row-Major vs Column-Major Matrix Traversal
+
+### Step 1: Run the BAD version (column-major)
 ```bash
-perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo
+./matrix_col_major
 ```

-Compare the cache miss counts and ratios.
+Note the execution time.
+
+### Step 2: Profile to identify the issue
+```bash
+perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major
+```
+
+Observe the high cache miss rate and count.
+
+### Step 3: Run the GOOD version (row-major)
+```bash
+./matrix_row_major
+```
+
+This should be significantly faster (often 3-10x).
+
+### Step 4: Profile to confirm the improvement
+```bash
+perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major
+```
+
+Compare the cache miss counts and ratios with the column-major version.

 ### Why does this happen?

@@ -67,20 +91,51 @@ Cache:  [█_______________] ← load entire line, use 1 int, evict
        [█_______________] ← repeat for each access
 ```

-## Exercise 2: Array vs Linked List
+---

-### Step 1: Build and run
+## Exercise 2: Data Structure Memory Layout
+
+### Step 1: Run the WORST case (scattered linked list)
 ```bash
-make list_vs_array
-./list_vs_array
+./list_scattered
 ```

-### Step 2: Measure cache behavior
+Note the execution time - this is the worst case.
+
+### Step 2: Profile the cache behavior
 ```bash
-perf stat -e cache-misses,cache-references ./list_vs_array
+perf stat -e cache-misses,cache-references ./list_scattered
 ```

-### Three cases compared:
+Observe the terrible cache miss rate due to random memory access.
+
+### Step 3: First improvement - sequential allocation
+```bash
+./list_sequential
+```
+
+This should be faster than the scattered list, because the nodes sit contiguously in memory and are linked in address order.
+
+### Step 4: Profile the improvement
+```bash
+perf stat -e cache-misses,cache-references ./list_sequential
+```
+
+Cache behavior improves, but still not optimal due to pointer chasing.
+
+### Step 5: Best solution - contiguous array
+```bash
+./array_sum
+```
+
+This should be the fastest by a significant margin.
+
+### Step 6: Profile the optimal case
+```bash
+perf stat -e cache-misses,cache-references ./array_sum
+```
+
+Compare the cache miss counts of all three programs:

 | Case | Memory Layout | Cache Behavior |
 |------|---------------|----------------|
@@ -88,17 +143,22 @@ perf stat -e cache-misses,cache-references ./list_vs_array
 | List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent |
 | List (scattered) | Random | Terrible - every access misses |

-### Why "sequential list" is still slower than array:
+### Why linked lists are slow

-1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address)
+Even with sequential allocation, linked lists are slower than arrays:
+
+1. **Pointer chasing**: the address of the next node is only known once the current node has been loaded, so the loads form a serial dependency chain that the CPU cannot run ahead of
 2. **Larger elements**: `struct node` is bigger than `int` (includes pointer)
 3. **Indirect access**: Extra memory load for the `next` pointer

+---
+
 ## Exercise 3: Deeper perf Analysis

 ### See more cache events
 ```bash
-perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo
+perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_col_major
+perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_row_major
 ```

 Events explained:
@@ -110,12 +170,14 @@ Events explained:

 ### Profile with perf record
 ```bash
-perf record -e cache-misses ./cache_demo
+perf record -e cache-misses ./matrix_col_major
 perf report
 ```

 This shows which functions cause the most cache misses.

+---
+
 ## Discussion Questions

 1. **Why doesn't the compiler fix this?**
--- a/scenario4-cache-misses/array_sum.c
+++ b/scenario4-cache-misses/array_sum.c
@@ -0,0 +1,65 @@
+/*
+ * GOOD: Contiguous Array Traversal
+ * =================================
+ * This program uses a contiguous array for excellent cache locality.
+ * The CPU prefetcher can predict sequential access patterns.
+ *
+ * Compile: make array_sum
+ * Run:     ./array_sum
+ * Profile: perf stat -e cache-misses,cache-references ./array_sum
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define N 10000000  /* 10 million elements */
+
+double get_time(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+long sum_array(int *arr, int n) {
+    long sum = 0;
+    for (int i = 0; i < n; i++) {
+        sum += arr[i];
+    }
+    return sum;
+}
+
+int *create_array(int n) {
+    int *arr = malloc(n * sizeof(int));
+    if (!arr) {
+        perror("malloc array");
+        exit(1);
+    }
+    for (int i = 0; i < n; i++) {
+        arr[i] = i % 100;
+    }
+    return arr;
+}
+
+int main(void) {
+    printf("Contiguous Array Traversal (%d elements)\n", N);
+    printf("Sequential memory access - CPU prefetcher works perfectly.\n\n");
+
+    printf("Creating array...\n");
+    int *arr = create_array(N);
+
+    /* Warm up */
+    sum_array(arr, N);
+
+    double start = get_time();
+    long result = sum_array(arr, N);
+    double elapsed = get_time() - start;
+
+    printf("Array sum: %ld in %.4f seconds\n\n", result, elapsed);
+
+    printf("To see cache behavior, run:\n");
+    printf("  perf stat -e cache-misses,cache-references ./array_sum\n");
+
+    free(arr);
+    return 0;
+}
--- a/scenario4-cache-misses/cache_demo.c
+++ b/scenario4-cache-misses/cache_demo.c
@@ -1,109 +0,0 @@
-/*
- * Scenario 4: Cache Misses - Memory Access Patterns
- * ==================================================
- * This program demonstrates the performance impact of memory access patterns.
- * Row-major vs column-major traversal of a 2D array.
- *
- * Compile: gcc -O2 -o cache_demo cache_demo.c
- * 
- * EXERCISES:
- * 1. Run: ./cache_demo
- * 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo
- * 3. Why is one so much faster?
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <string.h>
-
-#define ROWS 8192
-#define COLS 8192
-
-/* 
- * Global array to ensure it's not optimized away.
- * This is a 64MB array (8192 * 8192 * sizeof(int) = 256MB if int is 4 bytes)
- * Wait, that's too big. Let's use smaller dimensions or chars.
- */
-
-/* Using static to avoid stack overflow */
-static int matrix[ROWS][COLS];
-
-double get_time(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return ts.tv_sec + ts.tv_nsec / 1e9;
-}
-
-long sum_row_major(void) {
-    /*
-     * Row-major traversal: access sequential memory addresses
-     * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ...
-     * This matches how C stores 2D arrays - CACHE FRIENDLY
-     */
-    long sum = 0;
-    for (int i = 0; i < ROWS; i++) {
-        for (int j = 0; j < COLS; j++) {
-            sum += matrix[i][j];
-        }
-    }
-    return sum;
-}
-
-long sum_col_major(void) {
-    /*
-     * Column-major traversal: jump around in memory
-     * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ...
-     * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE
-     */
-    long sum = 0;
-    for (int j = 0; j < COLS; j++) {
-        for (int i = 0; i < ROWS; i++) {
-            sum += matrix[i][j];
-        }
-    }
-    return sum;
-}
-
-void init_matrix(void) {
-    /* Initialize with some values */
-    for (int i = 0; i < ROWS; i++) {
-        for (int j = 0; j < COLS; j++) {
-            matrix[i][j] = (i + j) % 100;
-        }
-    }
-}
-
-int main(void) {
-    printf("Matrix size: %d x %d = %zu bytes\n", 
-           ROWS, COLS, sizeof(matrix));
-    printf("Cache line size (typical): 64 bytes\n");
-    printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int));
-
-    init_matrix();
-
-    double start, elapsed;
-    long result;
-
-    /* Warm up */
-    result = sum_row_major();
-    result = sum_col_major();
-
-    /* Row-major benchmark */
-    start = get_time();
-    result = sum_row_major();
-    elapsed = get_time() - start;
-    printf("Row-major sum:    %ld in %.3f seconds\n", result, elapsed);
-
-    /* Column-major benchmark */
-    start = get_time();
-    result = sum_col_major();
-    elapsed = get_time() - start;
-    printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed);
-
-    printf("\n");
-    printf("To see cache misses, run:\n");
-    printf("  perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n");
-
-    return 0;
-}
--- a/scenario4-cache-misses/list_scattered.c
+++ b/scenario4-cache-misses/list_scattered.c
@@ -0,0 +1,123 @@
+/*
+ * BAD: Scattered Linked List Traversal
+ * =====================================
+ * This program creates a linked list with nodes scattered randomly in memory,
+ * simulating real-world fragmented allocation patterns.
+ * This causes terrible cache behavior due to random memory access.
+ *
+ * Compile: make list_scattered
+ * Run:     ./list_scattered
+ * Profile: perf stat -e cache-misses,cache-references ./list_scattered
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <time.h>
+
+#define N 10000000  /* 10 million elements */
+
+struct node {
+    int value;
+    struct node *next;
+};
+
+double get_time(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+/* xorshift64 PRNG with a fixed seed: deterministic across runs and much faster than rand() */
+static uint64_t xorshift64_state = 42;
+
+static inline uint64_t xorshift64(void) {
+    uint64_t x = xorshift64_state;
+    x ^= x << 13;
+    x ^= x >> 7;
+    x ^= x << 17;
+    xorshift64_state = x;
+    return x;
+}
+
+long sum_list(struct node *head) {
+    long sum = 0;
+    struct node *curr = head;
+    while (curr != NULL) {
+        sum += curr->value;
+        curr = curr->next;
+    }
+    return sum;
+}
+
+/*
+ * Create linked list with nodes scattered in memory (worst case for cache)
+ * Each node is allocated individually; the node pointers are then shuffled (Fisher-Yates) and linked in that shuffled order.
+ */
+struct node *create_list_scattered(int n) {
+    struct node **nodes = malloc(n * sizeof(struct node *));
+    if (!nodes) {
+        perror("malloc");
+        exit(1);
+    }
+
+    /* Allocate each node separately - they end up scattered in heap */
+    for (int i = 0; i < n; i++) {
+        nodes[i] = malloc(sizeof(struct node));
+        if (!nodes[i]) {
+            perror("malloc node");
+            exit(1);
+        }
+        nodes[i]->value = i % 100;
+    }
+
+    /* Shuffle the order (Fisher-Yates) to ensure random access pattern */
+    for (int i = n - 1; i > 0; i--) {
+        int j = xorshift64() % (i + 1);
+        struct node *tmp = nodes[i];
+        nodes[i] = nodes[j];
+        nodes[j] = tmp;
+    }
+
+    /* Link them in shuffled order */
+    for (int i = 0; i < n - 1; i++) {
+        nodes[i]->next = nodes[i + 1];
+    }
+    nodes[n - 1]->next = NULL;
+
+    struct node *head = nodes[0];
+    free(nodes);  /* Free the pointer array, not the nodes */
+    return head;
+}
+
+void free_scattered_list(struct node *head) {
+    while (head != NULL) {
+        struct node *next = head->next;
+        free(head);
+        head = next;
+    }
+}
+
+int main(void) {
+    printf("Scattered Linked List Traversal (%d elements)\n", N);
+    printf("Each node allocated individually, then linked in random order.\n");
+    printf("This causes maximum cache thrashing.\n\n");
+
+    printf("Creating scattered linked list (this takes a while)...\n");
+    struct node *list = create_list_scattered(N);
+
+    /* Warm up */
+    sum_list(list);
+
+    double start = get_time();
+    long result = sum_list(list);
+    double elapsed = get_time() - start;
+
+    printf("Scattered list sum: %ld in %.4f seconds\n\n", result, elapsed);
+
+    printf("To see cache behavior, run:\n");
+    printf("  perf stat -e cache-misses,cache-references ./list_scattered\n");
+
+    free_scattered_list(list);
+    return 0;
+}
--- a/scenario4-cache-misses/list_sequential.c
+++ b/scenario4-cache-misses/list_sequential.c
@@ -0,0 +1,83 @@
+/*
+ * MEDIUM: Sequential Linked List Traversal
+ * =========================================
+ * This program creates a linked list with nodes allocated contiguously,
+ * representing the best-case scenario for linked lists.
+ * Still slower than arrays due to pointer chasing overhead.
+ *
+ * Compile: make list_sequential
+ * Run:     ./list_sequential
+ * Profile: perf stat -e cache-misses,cache-references ./list_sequential
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define N 10000000  /* 10 million elements */
+
+struct node {
+    int value;
+    struct node *next;
+};
+
+double get_time(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+long sum_list(struct node *head) {
+    long sum = 0;
+    struct node *curr = head;
+    while (curr != NULL) {
+        sum += curr->value;
+        curr = curr->next;
+    }
+    return sum;
+}
+
+/*
+ * Create linked list with nodes allocated sequentially (best case for list)
+ * All nodes allocated in one contiguous block, linked in order.
+ */
+struct node *create_list_sequential(int n) {
+    struct node *nodes = malloc(n * sizeof(struct node));
+    if (!nodes) {
+        perror("malloc list");
+        exit(1);
+    }
+
+    for (int i = 0; i < n - 1; i++) {
+        nodes[i].value = i % 100;
+        nodes[i].next = &nodes[i + 1];
+    }
+    nodes[n - 1].value = (n - 1) % 100;
+    nodes[n - 1].next = NULL;
+
+    return nodes;
+}
+
+int main(void) {
+    printf("Sequential Linked List Traversal (%d elements)\n", N);
+    printf("All nodes allocated contiguously - best case for linked list.\n");
+    printf("Still has pointer chasing overhead vs array.\n\n");
+
+    printf("Creating sequential linked list...\n");
+    struct node *list = create_list_sequential(N);
+
+    /* Warm up */
+    sum_list(list);
+
+    double start = get_time();
+    long result = sum_list(list);
+    double elapsed = get_time() - start;
+
+    printf("Sequential list sum: %ld in %.4f seconds\n\n", result, elapsed);
+
+    printf("To see cache behavior, run:\n");
+    printf("  perf stat -e cache-misses,cache-references ./list_sequential\n");
+
+    free(list);
+    return 0;
+}
--- a/scenario4-cache-misses/list_vs_array.c
+++ b/scenario4-cache-misses/list_vs_array.c
@@ -1,175 +0,0 @@
-/*
- * Scenario 4b: Array vs Linked List Traversal
- * ============================================
- * Arrays have excellent cache locality; linked lists do not.
- * This demonstrates why "O(n) vs O(n)" can have very different constants.
- *
- * Compile: gcc -O2 -o list_vs_array list_vs_array.c
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-
-#define N 10000000  /* 10 million elements */
-
-struct node {
-    int value;
-    struct node *next;
-};
-
-double get_time(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return ts.tv_sec + ts.tv_nsec / 1e9;
-}
-
-/* Sum array elements */
-long sum_array(int *arr, int n) {
-    long sum = 0;
-    for (int i = 0; i < n; i++) {
-        sum += arr[i];
-    }
-    return sum;
-}
-
-/* Sum linked list elements */
-long sum_list(struct node *head) {
-    long sum = 0;
-    struct node *curr = head;
-    while (curr != NULL) {
-        sum += curr->value;
-        curr = curr->next;
-    }
-    return sum;
-}
-
-/* Create array */
-int *create_array(int n) {
-    int *arr = malloc(n * sizeof(int));
-    if (!arr) {
-        perror("malloc array");
-        exit(1);
-    }
-    for (int i = 0; i < n; i++) {
-        arr[i] = i % 100;
-    }
-    return arr;
-}
-
-/* Create linked list - nodes allocated sequentially (best case for list) */
-struct node *create_list_sequential(int n) {
-    struct node *nodes = malloc(n * sizeof(struct node));
-    if (!nodes) {
-        perror("malloc list");
-        exit(1);
-    }
-    
-    for (int i = 0; i < n - 1; i++) {
-        nodes[i].value = i % 100;
-        nodes[i].next = &nodes[i + 1];
-    }
-    nodes[n - 1].value = (n - 1) % 100;
-    nodes[n - 1].next = NULL;
-    
-    return nodes;
-}
-
-/* Create linked list - nodes allocated randomly (worst case for cache) */
-struct node *create_list_scattered(int n) {
-    /* Allocate nodes individually to scatter them in memory */
-    struct node **nodes = malloc(n * sizeof(struct node *));
-    if (!nodes) {
-        perror("malloc");
-        exit(1);
-    }
-    
-    /* Allocate each node separately */
-    for (int i = 0; i < n; i++) {
-        nodes[i] = malloc(sizeof(struct node));
-        if (!nodes[i]) {
-            perror("malloc node");
-            exit(1);
-        }
-        nodes[i]->value = i % 100;
-    }
-    
-    /* Shuffle the order (Fisher-Yates) */
-    srand(42);
-    for (int i = n - 1; i > 0; i--) {
-        int j = rand() % (i + 1);
-        struct node *tmp = nodes[i];
-        nodes[i] = nodes[j];
-        nodes[j] = tmp;
-    }
-    
-    /* Link them in shuffled order */
-    for (int i = 0; i < n - 1; i++) {
-        nodes[i]->next = nodes[i + 1];
-    }
-    nodes[n - 1]->next = NULL;
-    
-    struct node *head = nodes[0];
-    free(nodes);  /* Free the pointer array, not the nodes */
-    return head;
-}
-
-void free_scattered_list(struct node *head) {
-    while (head != NULL) {
-        struct node *next = head->next;
-        free(head);
-        head = next;
-    }
-}
-
-int main(void) {
-    printf("Comparing array vs linked list traversal (%d elements)\n\n", N);
-
-    double start, elapsed;
-    long result;
-
-    /* Array */
-    printf("Creating array...\n");
-    int *arr = create_array(N);
-    
-    start = get_time();
-    result = sum_array(arr, N);
-    elapsed = get_time() - start;
-    printf("Array sum:             %ld in %.4f seconds\n", result, elapsed);
-    double array_time = elapsed;
-    free(arr);
-
-    /* Sequential linked list (best case for list) */
-    printf("\nCreating sequential linked list...\n");
-    struct node *list_seq = create_list_sequential(N);
-    
-    start = get_time();
-    result = sum_list(list_seq);
-    elapsed = get_time() - start;
-    printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed);
-    double list_seq_time = elapsed;
-    free(list_seq);
-
-    /* Scattered linked list (worst case for cache) */
-    printf("\nCreating scattered linked list (this takes a while)...\n");
-    struct node *list_scat = create_list_scattered(N);
-    
-    start = get_time();
-    result = sum_list(list_scat);
-    elapsed = get_time() - start;
-    printf("List sum (scattered):  %ld in %.4f seconds\n", result, elapsed);
-    double list_scat_time = elapsed;
-    free_scattered_list(list_scat);
-
-    printf("\n--- Summary ---\n");
-    printf("Array:              %.4fs (baseline)\n", array_time);
-    printf("List (sequential):  %.4fs (%.1fx slower)\n", 
-           list_seq_time, list_seq_time / array_time);
-    printf("List (scattered):   %.4fs (%.1fx slower)\n", 
-           list_scat_time, list_scat_time / array_time);
-
-    printf("\nTo see cache behavior:\n");
-    printf("  perf stat -e cache-misses,cache-references ./list_vs_array\n");
-
-    return 0;
-}
--- a/scenario4-cache-misses/matrix_col_major.c
+++ b/scenario4-cache-misses/matrix_col_major.c
@@ -0,0 +1,73 @@
+/*
+ * BAD: Column-Major Matrix Traversal
+ * ===================================
+ * This program traverses a 2D matrix in column-major order,
+ * which causes poor cache utilization because C stores arrays in row-major order.
+ *
+ * Compile: make matrix_col_major
+ * Run:     ./matrix_col_major
+ * Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ROWS 8192
+#define COLS 8192
+
+/* Static storage: ROWS * COLS * sizeof(int) bytes (256 MiB with 4-byte int) is far too large for the stack */
+static int matrix[ROWS][COLS];
+
+double get_time(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+void init_matrix(void) {
+    for (int i = 0; i < ROWS; i++) {
+        for (int j = 0; j < COLS; j++) {
+            matrix[i][j] = (i + j) % 100;
+        }
+    }
+}
+
+/*
+ * Column-major traversal: jump around in memory
+ * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ...
+ * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE
+ */
+long sum_col_major(void) {
+    long sum = 0;
+    for (int j = 0; j < COLS; j++) {
+        for (int i = 0; i < ROWS; i++) {
+            sum += matrix[i][j];
+        }
+    }
+    return sum;
+}
+
+int main(void) {
+    printf("Matrix size: %d x %d = %zu bytes\n",
+           ROWS, COLS, sizeof(matrix));
+    printf("Cache line size (typical): 64 bytes\n");
+    printf("Stride per access: %zu bytes (jumps over entire row!)\n\n",
+           COLS * sizeof(int));
+
+    init_matrix();
+
+    /* Warm up */
+    sum_col_major();
+
+    double start = get_time();
+    long result = sum_col_major();
+    double elapsed = get_time() - start;
+
+    printf("Column-major sum: %ld in %.3f seconds\n\n", result, elapsed);
+
+    printf("To see cache misses, run:\n");
+    printf("  perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major\n");
+
+    return 0;
+}
--- a/scenario4-cache-misses/matrix_row_major.c
+++ b/scenario4-cache-misses/matrix_row_major.c
@@ -0,0 +1,73 @@
+/*
+ * GOOD: Row-Major Matrix Traversal
+ * =================================
+ * This program traverses a 2D matrix in row-major order,
+ * matching how C stores 2D arrays in memory for excellent cache utilization.
+ *
+ * Compile: make matrix_row_major
+ * Run:     ./matrix_row_major
+ * Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ROWS 8192
+#define COLS 8192
+
+/* Static storage: ROWS * COLS * sizeof(int) bytes (256 MiB with 4-byte int) is far too large for the stack */
+static int matrix[ROWS][COLS];
+
+double get_time(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+void init_matrix(void) {
+    for (int i = 0; i < ROWS; i++) {
+        for (int j = 0; j < COLS; j++) {
+            matrix[i][j] = (i + j) % 100;
+        }
+    }
+}
+
+/*
+ * Row-major traversal: access sequential memory addresses
+ * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ...
+ * This matches how C stores 2D arrays - CACHE FRIENDLY
+ */
+long sum_row_major(void) {
+    long sum = 0;
+    for (int i = 0; i < ROWS; i++) {
+        for (int j = 0; j < COLS; j++) {
+            sum += matrix[i][j];
+        }
+    }
+    return sum;
+}
+
+int main(void) {
+    printf("Matrix size: %d x %d = %zu bytes\n",
+           ROWS, COLS, sizeof(matrix));
+    printf("Cache line size (typical): 64 bytes\n");
+    printf("Stride per access: %zu bytes (sequential!)\n\n",
+           sizeof(int));
+
+    init_matrix();
+
+    /* Warm up */
+    sum_row_major();
+
+    double start = get_time();
+    long result = sum_row_major();
+    double elapsed = get_time() - start;
+
+    printf("Row-major sum: %ld in %.3f seconds\n\n", result, elapsed);
+
+    printf("To see cache behavior, run:\n");
+    printf("  perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major\n");
+
+    return 0;
+}