scenario 4: break down into multiple files
This commit is contained in:
parent
25f47e017d
commit
51ab2ed553
@ -1,15 +1,14 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS = -O2 -Wall
|
CFLAGS = -O2 -Wall
|
||||||
|
|
||||||
all: cache_demo list_vs_array
|
TARGETS = matrix_col_major matrix_row_major list_scattered list_sequential array_sum
|
||||||
|
|
||||||
cache_demo: cache_demo.c
|
all: $(TARGETS)
|
||||||
$(CC) $(CFLAGS) -o $@ $<
|
|
||||||
|
|
||||||
list_vs_array: list_vs_array.c
|
%: %.c
|
||||||
$(CC) $(CFLAGS) -o $@ $<
|
$(CC) $(CFLAGS) -o $@ $<
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f cache_demo list_vs_array
|
rm -f $(TARGETS)
|
||||||
|
|
||||||
.PHONY: all clean
|
.PHONY: all clean
|
||||||
|
|||||||
@ -26,25 +26,49 @@ Key concepts:
|
|||||||
- **Temporal locality**: Recently accessed data is likely to be accessed again
|
- **Temporal locality**: Recently accessed data is likely to be accessed again
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
- `cache_demo.c` - Row-major vs column-major 2D array traversal
|
- `matrix_col_major.c` - BAD: Column-major traversal (cache-hostile)
|
||||||
- `list_vs_array.c` - Array vs linked list traversal
|
- `matrix_row_major.c` - GOOD: Row-major traversal (cache-friendly)
|
||||||
|
- `list_scattered.c` - BAD: Scattered linked list (worst cache behavior)
|
||||||
|
- `list_sequential.c` - MEDIUM: Sequential linked list (better, but still has overhead)
|
||||||
|
- `array_sum.c` - GOOD: Contiguous array (best cache behavior)
|
||||||
|
|
||||||
## Exercise 1: Row vs Column Major
|
## Setup
|
||||||
|
|
||||||
### Step 1: Build and run
|
|
||||||
```bash
|
```bash
|
||||||
make cache_demo
|
make all
|
||||||
./cache_demo
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You should see column-major is significantly slower (often 3-10x).
|
---
|
||||||
|
|
||||||
### Step 2: Measure cache misses
|
## Exercise 1: Row-Major vs Column-Major Matrix Traversal
|
||||||
|
|
||||||
|
### Step 1: Run the BAD version (column-major)
|
||||||
```bash
|
```bash
|
||||||
perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo
|
./matrix_col_major
|
||||||
```
|
```
|
||||||
|
|
||||||
Compare the cache miss counts and ratios.
|
Note the execution time.
|
||||||
|
|
||||||
|
### Step 2: Profile to identify the issue
|
||||||
|
```bash
|
||||||
|
perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major
|
||||||
|
```
|
||||||
|
|
||||||
|
Observe the high cache miss rate and count.
|
||||||
|
|
||||||
|
### Step 3: Run the GOOD version (row-major)
|
||||||
|
```bash
|
||||||
|
./matrix_row_major
|
||||||
|
```
|
||||||
|
|
||||||
|
This should be significantly faster (often 3-10x).
|
||||||
|
|
||||||
|
### Step 4: Profile to confirm the improvement
|
||||||
|
```bash
|
||||||
|
perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major
|
||||||
|
```
|
||||||
|
|
||||||
|
Compare the cache miss counts and ratios with the column-major version.
|
||||||
|
|
||||||
### Why does this happen?
|
### Why does this happen?
|
||||||
|
|
||||||
@ -67,20 +91,51 @@ Cache: [█_______________] ← load entire line, use 1 int, evict
|
|||||||
[█_______________] ← repeat for each access
|
[█_______________] ← repeat for each access
|
||||||
```
|
```
|
||||||
|
|
||||||
## Exercise 2: Array vs Linked List
|
---
|
||||||
|
|
||||||
### Step 1: Build and run
|
## Exercise 2: Data Structure Memory Layout
|
||||||
|
|
||||||
|
### Step 1: Run the WORST case (scattered linked list)
|
||||||
```bash
|
```bash
|
||||||
make list_vs_array
|
./list_scattered
|
||||||
./list_vs_array
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2: Measure cache behavior
|
Note the execution time - this is the worst case.
|
||||||
|
|
||||||
|
### Step 2: Profile the cache behavior
|
||||||
```bash
|
```bash
|
||||||
perf stat -e cache-misses,cache-references ./list_vs_array
|
perf stat -e cache-misses,cache-references ./list_scattered
|
||||||
```
|
```
|
||||||
|
|
||||||
### Three cases compared:
|
Observe the terrible cache miss rate due to random memory access.
|
||||||
|
|
||||||
|
### Step 3: First improvement - sequential allocation
|
||||||
|
```bash
|
||||||
|
./list_sequential
|
||||||
|
```
|
||||||
|
|
||||||
|
This should be faster than scattered, as nodes are contiguous in memory.
|
||||||
|
|
||||||
|
### Step 4: Profile the improvement
|
||||||
|
```bash
|
||||||
|
perf stat -e cache-misses,cache-references ./list_sequential
|
||||||
|
```
|
||||||
|
|
||||||
|
Cache behavior improves, but still not optimal due to pointer chasing.
|
||||||
|
|
||||||
|
### Step 5: Best solution - contiguous array
|
||||||
|
```bash
|
||||||
|
./array_sum
|
||||||
|
```
|
||||||
|
|
||||||
|
This should be the fastest by a significant margin.
|
||||||
|
|
||||||
|
### Step 6: Profile the optimal case
|
||||||
|
```bash
|
||||||
|
perf stat -e cache-misses,cache-references ./array_sum
|
||||||
|
```
|
||||||
|
|
||||||
|
Compare all three cache miss counts:
|
||||||
|
|
||||||
| Case | Memory Layout | Cache Behavior |
|
| Case | Memory Layout | Cache Behavior |
|
||||||
|------|---------------|----------------|
|
|------|---------------|----------------|
|
||||||
@ -88,17 +143,22 @@ perf stat -e cache-misses,cache-references ./list_vs_array
|
|||||||
| List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent |
|
| List (sequential) | Contiguous (single allocation) | Good - nodes are adjacent because they are allocated as one block |
|
||||||
| List (scattered) | Random | Terrible - every access misses |
|
| List (scattered) | Random | Terrible - every access misses |
|
||||||
|
|
||||||
### Why "sequential list" is still slower than array:
|
### Why linked lists are slow
|
||||||
|
|
||||||
1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address)
|
Even with sequential allocation, linked lists are slower than arrays:
|
||||||
|
|
||||||
|
1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address until current node is loaded)
|
||||||
2. **Larger elements**: `struct node` is bigger than `int` (includes pointer)
|
2. **Larger elements**: `struct node` is bigger than `int` (includes pointer)
|
||||||
3. **Indirect access**: Extra memory load for the `next` pointer
|
3. **Indirect access**: Extra memory load for the `next` pointer
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Exercise 3: Deeper perf Analysis
|
## Exercise 3: Deeper perf Analysis
|
||||||
|
|
||||||
### See more cache events
|
### See more cache events
|
||||||
```bash
|
```bash
|
||||||
perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo
|
perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_col_major
|
||||||
|
perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./matrix_row_major
|
||||||
```
|
```
|
||||||
|
|
||||||
Events explained:
|
Events explained:
|
||||||
@ -110,12 +170,14 @@ Events explained:
|
|||||||
|
|
||||||
### Profile with perf record
|
### Profile with perf record
|
||||||
```bash
|
```bash
|
||||||
perf record -e cache-misses ./cache_demo
|
perf record -e cache-misses ./matrix_col_major
|
||||||
perf report
|
perf report
|
||||||
```
|
```
|
||||||
|
|
||||||
This shows which functions cause the most cache misses.
|
This shows which functions cause the most cache misses.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Discussion Questions
|
## Discussion Questions
|
||||||
|
|
||||||
1. **Why doesn't the compiler fix this?**
|
1. **Why doesn't the compiler fix this?**
|
||||||
|
|||||||
65
scenario4-cache-misses/array_sum.c
Normal file
65
scenario4-cache-misses/array_sum.c
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
/*
|
||||||
|
* GOOD: Contiguous Array Traversal
|
||||||
|
* =================================
|
||||||
|
* This program uses a contiguous array for excellent cache locality.
|
||||||
|
* The CPU prefetcher can predict sequential access patterns.
|
||||||
|
*
|
||||||
|
* Compile: make array_sum
|
||||||
|
* Run: ./array_sum
|
||||||
|
* Profile: perf stat -e cache-misses,cache-references ./array_sum
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define N 10000000 /* 10 million elements */
|
||||||
|
|
||||||
|
double get_time(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Add up the first n elements of arr.
 *
 * arr: array holding at least n ints; it is only read, never written,
 *      so it is const-qualified (callers passing int * are unaffected).
 * n:   number of elements to add; for n <= 0 the loop does not run and
 *      arr is not dereferenced.
 *
 * Returns the total as a long (0 for n <= 0).
 */
long sum_array(const int *arr, int n) {
    long sum = 0;

    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    return sum;
}
|
||||||
|
|
||||||
|
/*
 * Allocate an array of n ints and fill element i with i % 100.
 *
 * n: number of elements requested.
 *
 * Returns a malloc'd array that the caller must free(), or NULL when
 * n <= 0 (there is nothing to allocate; free(NULL) and sum_array(NULL, 0)
 * are both safe).  If the allocation itself fails, prints the error with
 * perror() and exits with status 1.
 */
int *create_array(int n) {
    if (n <= 0) {
        /* malloc(0) is allowed to return NULL, which must not be reported
         * as an out-of-memory failure; a negative count is never valid. */
        return NULL;
    }

    /* Convert the count first so the byte size is computed in size_t. */
    int *arr = malloc((size_t)n * sizeof *arr);
    if (!arr) {
        perror("malloc array");
        exit(1);
    }

    for (int i = 0; i < n; i++) {
        arr[i] = i % 100;
    }
    return arr;
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
printf("Contiguous Array Traversal (%d elements)\n", N);
|
||||||
|
printf("Sequential memory access - CPU prefetcher works perfectly.\n\n");
|
||||||
|
|
||||||
|
printf("Creating array...\n");
|
||||||
|
int *arr = create_array(N);
|
||||||
|
|
||||||
|
/* Warm up */
|
||||||
|
sum_array(arr, N);
|
||||||
|
|
||||||
|
double start = get_time();
|
||||||
|
long result = sum_array(arr, N);
|
||||||
|
double elapsed = get_time() - start;
|
||||||
|
|
||||||
|
printf("Array sum: %ld in %.4f seconds\n\n", result, elapsed);
|
||||||
|
|
||||||
|
printf("To see cache behavior, run:\n");
|
||||||
|
printf(" perf stat -e cache-misses,cache-references ./array_sum\n");
|
||||||
|
|
||||||
|
free(arr);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@ -1,109 +0,0 @@
|
|||||||
/*
|
|
||||||
* Scenario 4: Cache Misses - Memory Access Patterns
|
|
||||||
* ==================================================
|
|
||||||
* This program demonstrates the performance impact of memory access patterns.
|
|
||||||
* Row-major vs column-major traversal of a 2D array.
|
|
||||||
*
|
|
||||||
* Compile: gcc -O2 -o cache_demo cache_demo.c
|
|
||||||
*
|
|
||||||
* EXERCISES:
|
|
||||||
* 1. Run: ./cache_demo
|
|
||||||
* 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo
|
|
||||||
* 3. Why is one so much faster?
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <time.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#define ROWS 8192
|
|
||||||
#define COLS 8192
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Global array to ensure it's not optimized away.
|
|
||||||
* This is a 64MB array (8192 * 8192 * sizeof(int) = 256MB if int is 4 bytes)
|
|
||||||
* Wait, that's too big. Let's use smaller dimensions or chars.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Using static to avoid stack overflow */
|
|
||||||
static int matrix[ROWS][COLS];
|
|
||||||
|
|
||||||
double get_time(void) {
|
|
||||||
struct timespec ts;
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
||||||
return ts.tv_sec + ts.tv_nsec / 1e9;
|
|
||||||
}
|
|
||||||
|
|
||||||
long sum_row_major(void) {
|
|
||||||
/*
|
|
||||||
* Row-major traversal: access sequential memory addresses
|
|
||||||
* Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ...
|
|
||||||
* This matches how C stores 2D arrays - CACHE FRIENDLY
|
|
||||||
*/
|
|
||||||
long sum = 0;
|
|
||||||
for (int i = 0; i < ROWS; i++) {
|
|
||||||
for (int j = 0; j < COLS; j++) {
|
|
||||||
sum += matrix[i][j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
long sum_col_major(void) {
|
|
||||||
/*
|
|
||||||
* Column-major traversal: jump around in memory
|
|
||||||
* Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ...
|
|
||||||
* Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE
|
|
||||||
*/
|
|
||||||
long sum = 0;
|
|
||||||
for (int j = 0; j < COLS; j++) {
|
|
||||||
for (int i = 0; i < ROWS; i++) {
|
|
||||||
sum += matrix[i][j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
void init_matrix(void) {
|
|
||||||
/* Initialize with some values */
|
|
||||||
for (int i = 0; i < ROWS; i++) {
|
|
||||||
for (int j = 0; j < COLS; j++) {
|
|
||||||
matrix[i][j] = (i + j) % 100;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void) {
|
|
||||||
printf("Matrix size: %d x %d = %zu bytes\n",
|
|
||||||
ROWS, COLS, sizeof(matrix));
|
|
||||||
printf("Cache line size (typical): 64 bytes\n");
|
|
||||||
printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int));
|
|
||||||
|
|
||||||
init_matrix();
|
|
||||||
|
|
||||||
double start, elapsed;
|
|
||||||
long result;
|
|
||||||
|
|
||||||
/* Warm up */
|
|
||||||
result = sum_row_major();
|
|
||||||
result = sum_col_major();
|
|
||||||
|
|
||||||
/* Row-major benchmark */
|
|
||||||
start = get_time();
|
|
||||||
result = sum_row_major();
|
|
||||||
elapsed = get_time() - start;
|
|
||||||
printf("Row-major sum: %ld in %.3f seconds\n", result, elapsed);
|
|
||||||
|
|
||||||
/* Column-major benchmark */
|
|
||||||
start = get_time();
|
|
||||||
result = sum_col_major();
|
|
||||||
elapsed = get_time() - start;
|
|
||||||
printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed);
|
|
||||||
|
|
||||||
printf("\n");
|
|
||||||
printf("To see cache misses, run:\n");
|
|
||||||
printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
123
scenario4-cache-misses/list_scattered.c
Normal file
123
scenario4-cache-misses/list_scattered.c
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
/*
|
||||||
|
* BAD: Scattered Linked List Traversal
|
||||||
|
* =====================================
|
||||||
|
* This program creates a linked list with nodes scattered randomly in memory,
|
||||||
|
* simulating real-world fragmented allocation patterns.
|
||||||
|
* This causes terrible cache behavior due to random memory access.
|
||||||
|
*
|
||||||
|
* Compile: make list_scattered
|
||||||
|
* Run: ./list_scattered
|
||||||
|
* Profile: perf stat -e cache-misses,cache-references ./list_scattered
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define N 10000000 /* 10 million elements */
|
||||||
|
|
||||||
|
/* One element of a singly linked list. */
struct node {
    int value;          /* payload that sum_list() adds up */
    struct node *next;  /* following element, or NULL at the tail */
};
|
||||||
|
|
||||||
|
double get_time(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Deterministic 64-bit xorshift generator used instead of rand().
 * The state starts at the fixed seed 42, so every run produces the same
 * sequence.
 */
static uint64_t xorshift64_state = 42;

/*
 * Advance the generator one step (shift-xor by 13 left, 7 right, 17 left),
 * store the new state and return it.
 */
static inline uint64_t xorshift64(void) {
    xorshift64_state ^= xorshift64_state << 13;
    xorshift64_state ^= xorshift64_state >> 7;
    xorshift64_state ^= xorshift64_state << 17;
    return xorshift64_state;
}
|
||||||
|
|
||||||
|
long sum_list(struct node *head) {
|
||||||
|
long sum = 0;
|
||||||
|
struct node *curr = head;
|
||||||
|
while (curr != NULL) {
|
||||||
|
sum += curr->value;
|
||||||
|
curr = curr->next;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create linked list with nodes scattered in memory (worst case for cache)
|
||||||
|
* Each node is allocated individually, then shuffled and linked randomly.
|
||||||
|
*/
|
||||||
|
struct node *create_list_scattered(int n) {
|
||||||
|
struct node **nodes = malloc(n * sizeof(struct node *));
|
||||||
|
if (!nodes) {
|
||||||
|
perror("malloc");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate each node separately - they end up scattered in heap */
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
nodes[i] = malloc(sizeof(struct node));
|
||||||
|
if (!nodes[i]) {
|
||||||
|
perror("malloc node");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
nodes[i]->value = i % 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Shuffle the order (Fisher-Yates) to ensure random access pattern */
|
||||||
|
for (int i = n - 1; i > 0; i--) {
|
||||||
|
int j = xorshift64() % (i + 1);
|
||||||
|
struct node *tmp = nodes[i];
|
||||||
|
nodes[i] = nodes[j];
|
||||||
|
nodes[j] = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Link them in shuffled order */
|
||||||
|
for (int i = 0; i < n - 1; i++) {
|
||||||
|
nodes[i]->next = nodes[i + 1];
|
||||||
|
}
|
||||||
|
nodes[n - 1]->next = NULL;
|
||||||
|
|
||||||
|
struct node *head = nodes[0];
|
||||||
|
free(nodes); /* Free the pointer array, not the nodes */
|
||||||
|
return head;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_scattered_list(struct node *head) {
|
||||||
|
while (head != NULL) {
|
||||||
|
struct node *next = head->next;
|
||||||
|
free(head);
|
||||||
|
head = next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
printf("Scattered Linked List Traversal (%d elements)\n", N);
|
||||||
|
printf("Each node allocated individually, then linked in random order.\n");
|
||||||
|
printf("This causes maximum cache thrashing.\n\n");
|
||||||
|
|
||||||
|
printf("Creating scattered linked list (this takes a while)...\n");
|
||||||
|
struct node *list = create_list_scattered(N);
|
||||||
|
|
||||||
|
/* Warm up */
|
||||||
|
sum_list(list);
|
||||||
|
|
||||||
|
double start = get_time();
|
||||||
|
long result = sum_list(list);
|
||||||
|
double elapsed = get_time() - start;
|
||||||
|
|
||||||
|
printf("Scattered list sum: %ld in %.4f seconds\n\n", result, elapsed);
|
||||||
|
|
||||||
|
printf("To see cache behavior, run:\n");
|
||||||
|
printf(" perf stat -e cache-misses,cache-references ./list_scattered\n");
|
||||||
|
|
||||||
|
free_scattered_list(list);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
83
scenario4-cache-misses/list_sequential.c
Normal file
83
scenario4-cache-misses/list_sequential.c
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
/*
|
||||||
|
* MEDIUM: Sequential Linked List Traversal
|
||||||
|
* =========================================
|
||||||
|
* This program creates a linked list with nodes allocated contiguously,
|
||||||
|
* representing the best-case scenario for linked lists.
|
||||||
|
* Still slower than arrays due to pointer chasing overhead.
|
||||||
|
*
|
||||||
|
* Compile: make list_sequential
|
||||||
|
* Run: ./list_sequential
|
||||||
|
* Profile: perf stat -e cache-misses,cache-references ./list_sequential
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define N 10000000 /* 10 million elements */
|
||||||
|
|
||||||
|
/* One element of a singly linked list. */
struct node {
    int value;          /* payload that sum_list() adds up */
    struct node *next;  /* following element, or NULL at the tail */
};
|
||||||
|
|
||||||
|
double get_time(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
long sum_list(struct node *head) {
|
||||||
|
long sum = 0;
|
||||||
|
struct node *curr = head;
|
||||||
|
while (curr != NULL) {
|
||||||
|
sum += curr->value;
|
||||||
|
curr = curr->next;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create linked list with nodes allocated sequentially (best case for list)
|
||||||
|
* All nodes allocated in one contiguous block, linked in order.
|
||||||
|
*/
|
||||||
|
struct node *create_list_sequential(int n) {
|
||||||
|
struct node *nodes = malloc(n * sizeof(struct node));
|
||||||
|
if (!nodes) {
|
||||||
|
perror("malloc list");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n - 1; i++) {
|
||||||
|
nodes[i].value = i % 100;
|
||||||
|
nodes[i].next = &nodes[i + 1];
|
||||||
|
}
|
||||||
|
nodes[n - 1].value = (n - 1) % 100;
|
||||||
|
nodes[n - 1].next = NULL;
|
||||||
|
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
printf("Sequential Linked List Traversal (%d elements)\n", N);
|
||||||
|
printf("All nodes allocated contiguously - best case for linked list.\n");
|
||||||
|
printf("Still has pointer chasing overhead vs array.\n\n");
|
||||||
|
|
||||||
|
printf("Creating sequential linked list...\n");
|
||||||
|
struct node *list = create_list_sequential(N);
|
||||||
|
|
||||||
|
/* Warm up */
|
||||||
|
sum_list(list);
|
||||||
|
|
||||||
|
double start = get_time();
|
||||||
|
long result = sum_list(list);
|
||||||
|
double elapsed = get_time() - start;
|
||||||
|
|
||||||
|
printf("Sequential list sum: %ld in %.4f seconds\n\n", result, elapsed);
|
||||||
|
|
||||||
|
printf("To see cache behavior, run:\n");
|
||||||
|
printf(" perf stat -e cache-misses,cache-references ./list_sequential\n");
|
||||||
|
|
||||||
|
free(list);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@ -1,175 +0,0 @@
|
|||||||
/*
|
|
||||||
* Scenario 4b: Array vs Linked List Traversal
|
|
||||||
* ============================================
|
|
||||||
* Arrays have excellent cache locality; linked lists do not.
|
|
||||||
* This demonstrates why "O(n) vs O(n)" can have very different constants.
|
|
||||||
*
|
|
||||||
* Compile: gcc -O2 -o list_vs_array list_vs_array.c
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
#define N 10000000 /* 10 million elements */
|
|
||||||
|
|
||||||
struct node {
|
|
||||||
int value;
|
|
||||||
struct node *next;
|
|
||||||
};
|
|
||||||
|
|
||||||
double get_time(void) {
|
|
||||||
struct timespec ts;
|
|
||||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
||||||
return ts.tv_sec + ts.tv_nsec / 1e9;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sum array elements */
|
|
||||||
long sum_array(int *arr, int n) {
|
|
||||||
long sum = 0;
|
|
||||||
for (int i = 0; i < n; i++) {
|
|
||||||
sum += arr[i];
|
|
||||||
}
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sum linked list elements */
|
|
||||||
long sum_list(struct node *head) {
|
|
||||||
long sum = 0;
|
|
||||||
struct node *curr = head;
|
|
||||||
while (curr != NULL) {
|
|
||||||
sum += curr->value;
|
|
||||||
curr = curr->next;
|
|
||||||
}
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Create array */
|
|
||||||
int *create_array(int n) {
|
|
||||||
int *arr = malloc(n * sizeof(int));
|
|
||||||
if (!arr) {
|
|
||||||
perror("malloc array");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
for (int i = 0; i < n; i++) {
|
|
||||||
arr[i] = i % 100;
|
|
||||||
}
|
|
||||||
return arr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Create linked list - nodes allocated sequentially (best case for list) */
|
|
||||||
struct node *create_list_sequential(int n) {
|
|
||||||
struct node *nodes = malloc(n * sizeof(struct node));
|
|
||||||
if (!nodes) {
|
|
||||||
perror("malloc list");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < n - 1; i++) {
|
|
||||||
nodes[i].value = i % 100;
|
|
||||||
nodes[i].next = &nodes[i + 1];
|
|
||||||
}
|
|
||||||
nodes[n - 1].value = (n - 1) % 100;
|
|
||||||
nodes[n - 1].next = NULL;
|
|
||||||
|
|
||||||
return nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Create linked list - nodes allocated randomly (worst case for cache) */
|
|
||||||
struct node *create_list_scattered(int n) {
|
|
||||||
/* Allocate nodes individually to scatter them in memory */
|
|
||||||
struct node **nodes = malloc(n * sizeof(struct node *));
|
|
||||||
if (!nodes) {
|
|
||||||
perror("malloc");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Allocate each node separately */
|
|
||||||
for (int i = 0; i < n; i++) {
|
|
||||||
nodes[i] = malloc(sizeof(struct node));
|
|
||||||
if (!nodes[i]) {
|
|
||||||
perror("malloc node");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
nodes[i]->value = i % 100;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Shuffle the order (Fisher-Yates) */
|
|
||||||
srand(42);
|
|
||||||
for (int i = n - 1; i > 0; i--) {
|
|
||||||
int j = rand() % (i + 1);
|
|
||||||
struct node *tmp = nodes[i];
|
|
||||||
nodes[i] = nodes[j];
|
|
||||||
nodes[j] = tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Link them in shuffled order */
|
|
||||||
for (int i = 0; i < n - 1; i++) {
|
|
||||||
nodes[i]->next = nodes[i + 1];
|
|
||||||
}
|
|
||||||
nodes[n - 1]->next = NULL;
|
|
||||||
|
|
||||||
struct node *head = nodes[0];
|
|
||||||
free(nodes); /* Free the pointer array, not the nodes */
|
|
||||||
return head;
|
|
||||||
}
|
|
||||||
|
|
||||||
void free_scattered_list(struct node *head) {
|
|
||||||
while (head != NULL) {
|
|
||||||
struct node *next = head->next;
|
|
||||||
free(head);
|
|
||||||
head = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void) {
|
|
||||||
printf("Comparing array vs linked list traversal (%d elements)\n\n", N);
|
|
||||||
|
|
||||||
double start, elapsed;
|
|
||||||
long result;
|
|
||||||
|
|
||||||
/* Array */
|
|
||||||
printf("Creating array...\n");
|
|
||||||
int *arr = create_array(N);
|
|
||||||
|
|
||||||
start = get_time();
|
|
||||||
result = sum_array(arr, N);
|
|
||||||
elapsed = get_time() - start;
|
|
||||||
printf("Array sum: %ld in %.4f seconds\n", result, elapsed);
|
|
||||||
double array_time = elapsed;
|
|
||||||
free(arr);
|
|
||||||
|
|
||||||
/* Sequential linked list (best case for list) */
|
|
||||||
printf("\nCreating sequential linked list...\n");
|
|
||||||
struct node *list_seq = create_list_sequential(N);
|
|
||||||
|
|
||||||
start = get_time();
|
|
||||||
result = sum_list(list_seq);
|
|
||||||
elapsed = get_time() - start;
|
|
||||||
printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed);
|
|
||||||
double list_seq_time = elapsed;
|
|
||||||
free(list_seq);
|
|
||||||
|
|
||||||
/* Scattered linked list (worst case for cache) */
|
|
||||||
printf("\nCreating scattered linked list (this takes a while)...\n");
|
|
||||||
struct node *list_scat = create_list_scattered(N);
|
|
||||||
|
|
||||||
start = get_time();
|
|
||||||
result = sum_list(list_scat);
|
|
||||||
elapsed = get_time() - start;
|
|
||||||
printf("List sum (scattered): %ld in %.4f seconds\n", result, elapsed);
|
|
||||||
double list_scat_time = elapsed;
|
|
||||||
free_scattered_list(list_scat);
|
|
||||||
|
|
||||||
printf("\n--- Summary ---\n");
|
|
||||||
printf("Array: %.4fs (baseline)\n", array_time);
|
|
||||||
printf("List (sequential): %.4fs (%.1fx slower)\n",
|
|
||||||
list_seq_time, list_seq_time / array_time);
|
|
||||||
printf("List (scattered): %.4fs (%.1fx slower)\n",
|
|
||||||
list_scat_time, list_scat_time / array_time);
|
|
||||||
|
|
||||||
printf("\nTo see cache behavior:\n");
|
|
||||||
printf(" perf stat -e cache-misses,cache-references ./list_vs_array\n");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
73
scenario4-cache-misses/matrix_col_major.c
Normal file
73
scenario4-cache-misses/matrix_col_major.c
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/*
|
||||||
|
* BAD: Column-Major Matrix Traversal
|
||||||
|
* ===================================
|
||||||
|
* This program traverses a 2D matrix in column-major order,
|
||||||
|
* which causes poor cache utilization because C stores arrays in row-major order.
|
||||||
|
*
|
||||||
|
* Compile: make matrix_col_major
|
||||||
|
* Run: ./matrix_col_major
|
||||||
|
* Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define ROWS 8192
|
||||||
|
#define COLS 8192
|
||||||
|
|
||||||
|
/* Using static to avoid stack overflow */
|
||||||
|
static int matrix[ROWS][COLS];
|
||||||
|
|
||||||
|
double get_time(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_matrix(void) {
|
||||||
|
for (int i = 0; i < ROWS; i++) {
|
||||||
|
for (int j = 0; j < COLS; j++) {
|
||||||
|
matrix[i][j] = (i + j) % 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Column-major traversal: jump around in memory
|
||||||
|
* Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ...
|
||||||
|
* Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE
|
||||||
|
*/
|
||||||
|
long sum_col_major(void) {
|
||||||
|
long sum = 0;
|
||||||
|
for (int j = 0; j < COLS; j++) {
|
||||||
|
for (int i = 0; i < ROWS; i++) {
|
||||||
|
sum += matrix[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
printf("Matrix size: %d x %d = %zu bytes\n",
|
||||||
|
ROWS, COLS, sizeof(matrix));
|
||||||
|
printf("Cache line size (typical): 64 bytes\n");
|
||||||
|
printf("Stride per access: %zu bytes (jumps over entire row!)\n\n",
|
||||||
|
COLS * sizeof(int));
|
||||||
|
|
||||||
|
init_matrix();
|
||||||
|
|
||||||
|
/* Warm up */
|
||||||
|
sum_col_major();
|
||||||
|
|
||||||
|
double start = get_time();
|
||||||
|
long result = sum_col_major();
|
||||||
|
double elapsed = get_time() - start;
|
||||||
|
|
||||||
|
printf("Column-major sum: %ld in %.3f seconds\n\n", result, elapsed);
|
||||||
|
|
||||||
|
printf("To see cache misses, run:\n");
|
||||||
|
printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_col_major\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
73
scenario4-cache-misses/matrix_row_major.c
Normal file
73
scenario4-cache-misses/matrix_row_major.c
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/*
|
||||||
|
* GOOD: Row-Major Matrix Traversal
|
||||||
|
* =================================
|
||||||
|
* This program traverses a 2D matrix in row-major order,
|
||||||
|
* matching how C stores 2D arrays in memory for excellent cache utilization.
|
||||||
|
*
|
||||||
|
* Compile: make matrix_row_major
|
||||||
|
* Run: ./matrix_row_major
|
||||||
|
* Profile: perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
#define ROWS 8192
|
||||||
|
#define COLS 8192
|
||||||
|
|
||||||
|
/* Using static to avoid stack overflow */
|
||||||
|
static int matrix[ROWS][COLS];
|
||||||
|
|
||||||
|
double get_time(void) {
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return ts.tv_sec + ts.tv_nsec / 1e9;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_matrix(void) {
|
||||||
|
for (int i = 0; i < ROWS; i++) {
|
||||||
|
for (int j = 0; j < COLS; j++) {
|
||||||
|
matrix[i][j] = (i + j) % 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Row-major traversal: access sequential memory addresses
|
||||||
|
* Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ...
|
||||||
|
* This matches how C stores 2D arrays - CACHE FRIENDLY
|
||||||
|
*/
|
||||||
|
long sum_row_major(void) {
|
||||||
|
long sum = 0;
|
||||||
|
for (int i = 0; i < ROWS; i++) {
|
||||||
|
for (int j = 0; j < COLS; j++) {
|
||||||
|
sum += matrix[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
printf("Matrix size: %d x %d = %zu bytes\n",
|
||||||
|
ROWS, COLS, sizeof(matrix));
|
||||||
|
printf("Cache line size (typical): 64 bytes\n");
|
||||||
|
printf("Stride per access: %zu bytes (sequential!)\n\n",
|
||||||
|
sizeof(int));
|
||||||
|
|
||||||
|
init_matrix();
|
||||||
|
|
||||||
|
/* Warm up */
|
||||||
|
sum_row_major();
|
||||||
|
|
||||||
|
double start = get_time();
|
||||||
|
long result = sum_row_major();
|
||||||
|
double elapsed = get_time() - start;
|
||||||
|
|
||||||
|
printf("Row-major sum: %ld in %.3f seconds\n\n", result, elapsed);
|
||||||
|
|
||||||
|
printf("To see cache behavior, run:\n");
|
||||||
|
printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./matrix_row_major\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user