init

scenario4-cache-misses/Makefile (new file)
CC = gcc
CFLAGS = -O2 -Wall

all: cache_demo list_vs_array

cache_demo: cache_demo.c
	$(CC) $(CFLAGS) -o $@ $<

list_vs_array: list_vs_array.c
	$(CC) $(CFLAGS) -o $@ $<

clean:
	rm -f cache_demo list_vs_array

.PHONY: all clean

scenario4-cache-misses/README.md (new file)
# Scenario 4: Cache Misses and Memory Access Patterns

## Learning Objectives

- Understand CPU cache basics (L1, L2, L3)
- Use `perf stat` to measure cache behavior
- Recognize cache-friendly vs cache-hostile access patterns
- Understand why Big-O notation doesn't tell the whole story

## Background: How CPU Caches Work

```
CPU Core
   ↓
L1 Cache (~32KB, ~4 cycles)
   ↓
L2 Cache (~256KB, ~12 cycles)
   ↓
L3 Cache (~8MB, ~40 cycles)
   ↓
Main RAM (~64GB, ~200 cycles)
```

Key concepts:
- **Cache line**: data is loaded in chunks (typically 64 bytes)
- **Spatial locality**: if you access byte N, bytes N+1, N+2, ... are likely already cached
- **Temporal locality**: recently accessed data is likely to be accessed again

## Files

- `cache_demo.c` - Row-major vs column-major 2D array traversal
- `list_vs_array.c` - Array vs linked list traversal

## Exercise 1: Row vs Column Major

### Step 1: Build and run
```bash
make cache_demo
./cache_demo
```

You should see that column-major traversal is significantly slower (often 3-10x).

### Step 2: Measure cache misses
```bash
perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo
```

Compare the cache miss counts and ratios.

### Why does this happen?

C stores 2D arrays in **row-major** order:
```
Memory: [0][0] [0][1] [0][2] ... [0][COLS-1] [1][0] [1][1] ...
        ←————— row 0 ——————→ ←—— row 1 ——→
```

**Row-major access**: sequential in memory → cache lines are fully utilized
```
Access: [0][0] [0][1] [0][2] [0][3] ...
Cache:  [████████████████] ← one cache line serves 16 ints
```

**Column-major access**: jumping by COLS * sizeof(int) bytes each time
```
Access: [0][0] [1][0] [2][0] [3][0] ...
Cache:  [█_______________] ← load entire line, use 1 int, evict
        [█_______________] ← repeat for each access
```

## Exercise 2: Array vs Linked List

### Step 1: Build and run
```bash
make list_vs_array
./list_vs_array
```

### Step 2: Measure cache behavior
```bash
perf stat -e cache-misses,cache-references ./list_vs_array
```

### Three cases compared

| Case | Memory Layout | Cache Behavior |
|------|---------------|----------------|
| Array | Contiguous | Excellent - prefetcher wins |
| List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent |
| List (scattered) | Random | Terrible - every access misses |

### Why a "sequential list" is still slower than an array

1. **Pointer chasing**: the CPU can't prefetch the next element (it doesn't know the address until the current node is loaded)
2. **Larger elements**: `struct node` is bigger than `int` (it includes a pointer), so fewer elements fit per cache line
3. **Indirect access**: an extra memory load for the `next` pointer
## Exercise 3: Deeper perf Analysis

### See more cache events
```bash
perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo
```

Events explained:
- `L1-dcache-*`: Level 1 data cache (fastest, smallest)
- `LLC-*`: Last Level Cache (L3, the slowest cache before RAM)
- `cycles`: total CPU cycles
- `instructions`: total instructions executed
- IPC (instructions per cycle): higher is better

### Profile with perf record
```bash
perf record -e cache-misses ./cache_demo
perf report
```

This shows which functions cause the most cache misses.

## Discussion Questions

1. **Why doesn't the compiler fix this?**
   - Compilers can sometimes interchange loops, but:
     - side effects may prevent the transformation
     - pointer aliasing makes it unsafe to assume
     - the programmer often knows better
2. **How big does the array need to be to see this effect?**
   - If the array fits in L1 cache: no difference
   - If it fits in L3 cache: moderate difference
   - If it exceeds L3 cache: dramatic difference

3. **What about multithreaded code?**
   - False sharing: different threads writing to the same cache line
   - Cache coherency traffic between cores
## Real-World Implications

- **Image processing**: process row-by-row, not column-by-column
- **Matrix operations**: libraries like BLAS use cache blocking
- **Data structures**: arrays often beat linked lists in practice
- **Database design**: row stores vs column stores
## Key Takeaways

1. **Memory access pattern matters as much as algorithmic complexity**
2. **Sequential access is almost always faster than random access**
3. **Measure with `perf stat` before optimizing**
4. **Big-O notation hides constant factors that can be 10-100x**

scenario4-cache-misses/cache_demo.c (new file)
/*
 * Scenario 4: Cache Misses - Memory Access Patterns
 * ==================================================
 * This program demonstrates the performance impact of memory access patterns:
 * row-major vs column-major traversal of a 2D array.
 *
 * Compile: gcc -O2 -o cache_demo cache_demo.c
 *
 * EXERCISES:
 * 1. Run: ./cache_demo
 * 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo
 * 3. Why is one so much faster?
 */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ROWS 8192
#define COLS 8192
/*
 * Global array: 8192 * 8192 * sizeof(int) = 256 MB with 4-byte ints,
 * far larger than any cache level, so traversal order dominates runtime.
 * Declared static so it lives in the data segment (a 256 MB stack array
 * would overflow) and so the accesses aren't optimized away.
 */
static int matrix[ROWS][COLS];

double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

long sum_row_major(void) {
    /*
     * Row-major traversal: access sequential memory addresses.
     * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ...
     * This matches how C stores 2D arrays - CACHE FRIENDLY.
     */
    long sum = 0;
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            sum += matrix[i][j];
        }
    }
    return sum;
}

long sum_col_major(void) {
    /*
     * Column-major traversal: jump around in memory.
     * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ...
     * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE.
     */
    long sum = 0;
    for (int j = 0; j < COLS; j++) {
        for (int i = 0; i < ROWS; i++) {
            sum += matrix[i][j];
        }
    }
    return sum;
}

void init_matrix(void) {
    /* Initialize with some values */
    for (int i = 0; i < ROWS; i++) {
        for (int j = 0; j < COLS; j++) {
            matrix[i][j] = (i + j) % 100;
        }
    }
}

int main(void) {
    printf("Matrix size: %d x %d = %zu bytes\n",
           ROWS, COLS, sizeof(matrix));
    printf("Cache line size (typical): 64 bytes\n");
    printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int));

    init_matrix();

    double start, elapsed;
    long result;

    /* Warm up */
    result = sum_row_major();
    result = sum_col_major();

    /* Row-major benchmark */
    start = get_time();
    result = sum_row_major();
    elapsed = get_time() - start;
    printf("Row-major sum:    %ld in %.3f seconds\n", result, elapsed);

    /* Column-major benchmark */
    start = get_time();
    result = sum_col_major();
    elapsed = get_time() - start;
    printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed);

    printf("\n");
    printf("To see cache misses, run:\n");
    printf("  perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n");

    return 0;
}

scenario4-cache-misses/list_vs_array.c (new file)
/*
 * Scenario 4b: Array vs Linked List Traversal
 * ============================================
 * Arrays have excellent cache locality; linked lists do not.
 * This demonstrates why "O(n) vs O(n)" can hide very different constants.
 *
 * Compile: gcc -O2 -o list_vs_array list_vs_array.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 10000000 /* 10 million elements */

struct node {
    int value;
    struct node *next;
};

double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

/* Sum array elements */
long sum_array(int *arr, int n) {
    long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    return sum;
}

/* Sum linked list elements */
long sum_list(struct node *head) {
    long sum = 0;
    struct node *curr = head;
    while (curr != NULL) {
        sum += curr->value;
        curr = curr->next;
    }
    return sum;
}

/* Create array */
int *create_array(int n) {
    int *arr = malloc(n * sizeof(int));
    if (!arr) {
        perror("malloc array");
        exit(1);
    }
    for (int i = 0; i < n; i++) {
        arr[i] = i % 100;
    }
    return arr;
}

/* Create linked list - nodes allocated sequentially (best case for a list) */
struct node *create_list_sequential(int n) {
    struct node *nodes = malloc(n * sizeof(struct node));
    if (!nodes) {
        perror("malloc list");
        exit(1);
    }

    for (int i = 0; i < n - 1; i++) {
        nodes[i].value = i % 100;
        nodes[i].next = &nodes[i + 1];
    }
    nodes[n - 1].value = (n - 1) % 100;
    nodes[n - 1].next = NULL;

    return nodes;
}

/* Create linked list - nodes scattered in memory (worst case for cache) */
struct node *create_list_scattered(int n) {
    /* Temporary pointer array so the nodes can be shuffled */
    struct node **nodes = malloc(n * sizeof(struct node *));
    if (!nodes) {
        perror("malloc");
        exit(1);
    }

    /* Allocate each node separately to scatter them in memory */
    for (int i = 0; i < n; i++) {
        nodes[i] = malloc(sizeof(struct node));
        if (!nodes[i]) {
            perror("malloc node");
            exit(1);
        }
        nodes[i]->value = i % 100;
    }

    /* Shuffle the order (Fisher-Yates) */
    srand(42);
    for (int i = n - 1; i > 0; i--) {
        int j = rand() % (i + 1);
        struct node *tmp = nodes[i];
        nodes[i] = nodes[j];
        nodes[j] = tmp;
    }

    /* Link them in shuffled order */
    for (int i = 0; i < n - 1; i++) {
        nodes[i]->next = nodes[i + 1];
    }
    nodes[n - 1]->next = NULL;

    struct node *head = nodes[0];
    free(nodes); /* Free the pointer array, not the nodes */
    return head;
}

void free_scattered_list(struct node *head) {
    while (head != NULL) {
        struct node *next = head->next;
        free(head);
        head = next;
    }
}

int main(void) {
    printf("Comparing array vs linked list traversal (%d elements)\n\n", N);

    double start, elapsed;
    long result;

    /* Array */
    printf("Creating array...\n");
    int *arr = create_array(N);

    start = get_time();
    result = sum_array(arr, N);
    elapsed = get_time() - start;
    printf("Array sum: %ld in %.4f seconds\n", result, elapsed);
    double array_time = elapsed;
    free(arr);

    /* Sequential linked list (best case for a list) */
    printf("\nCreating sequential linked list...\n");
    struct node *list_seq = create_list_sequential(N);

    start = get_time();
    result = sum_list(list_seq);
    elapsed = get_time() - start;
    printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed);
    double list_seq_time = elapsed;
    free(list_seq);

    /* Scattered linked list (worst case for cache) */
    printf("\nCreating scattered linked list (this takes a while)...\n");
    struct node *list_scat = create_list_scattered(N);

    start = get_time();
    result = sum_list(list_scat);
    elapsed = get_time() - start;
    printf("List sum (scattered): %ld in %.4f seconds\n", result, elapsed);
    double list_scat_time = elapsed;
    free_scattered_list(list_scat);

    printf("\n--- Summary ---\n");
    printf("Array:             %.4fs (baseline)\n", array_time);
    printf("List (sequential): %.4fs (%.1fx slower)\n",
           list_seq_time, list_seq_time / array_time);
    printf("List (scattered):  %.4fs (%.1fx slower)\n",
           list_scat_time, list_scat_time / array_time);

    printf("\nTo see cache behavior:\n");
    printf("  perf stat -e cache-misses,cache-references ./list_vs_array\n");

    return 0;
}