commit 4fb1bd90db7b53ea71fa8076ddaac9b8911b4b43 Author: illustris Date: Thu Jan 8 18:11:30 2026 +0530 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b25c15b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d720c1 --- /dev/null +++ b/README.md @@ -0,0 +1,275 @@ +# Linux Performance Engineering Workshop + +## 4-Hour Hands-On Training for BITS Pilani Goa + +### Prerequisites +- Basic C programming knowledge +- Basic Python knowledge +- Familiarity with command line +- Ubuntu 22.04/24.04 (or similar Linux) + +--- + +## Workshop Overview + +This workshop teaches practical performance engineering skills using libre tools on Linux. +By the end, you'll be able to identify and fix common performance problems. + +### What You'll Learn +- How to measure program performance (not guess!) +- CPU profiling with `perf` and flamegraphs +- Identifying syscall overhead with `strace` +- Understanding cache behavior +- Continuous profiling for production systems + +### Philosophy +> "Measure, don't guess." + +Most performance intuitions are wrong. This workshop teaches you to find bottlenecks with data. + +--- + +## Schedule + +| Time | Topic | Hands-On | +|------|-------|----------| +| 0:00-0:45 | Introduction & Theory | - | +| 0:45-1:30 | Python Profiling | Scenarios 1 & 2 | +| 1:30-1:45 | Break | - | +| 1:45-2:30 | perf & Flamegraphs | Theory + Demo | +| 2:30-3:30 | Cache & Debug Symbols | Scenarios 4 & 5 | +| 3:30-4:00 | Lunch Break | - | +| 4:00-4:30 | Syscalls & I/O | Theory | +| 4:30-5:15 | Syscall Profiling | Scenario 3 | +| 5:15-5:30 | Break | - | +| 5:30-6:00 | Advanced Topics & Wrap-up | Scenarios 6 & 7 | + +--- + +## Setup Instructions + +### Install Required Packages + +```bash +# Core tools +sudo apt update +sudo apt install -y \ + build-essential \ + linux-tools-common \ + linux-tools-$(uname -r) \ + strace \ + ltrace \ + htop \ + python3-pip + +# Optional but recommended +sudo apt install -y \ + hyperfine \ + valgrind \ + systemtap-sdt-dev + +# Python tools +pip3 install py-spy + +# Pyroscope (for scenario 7) +# Option A: Docker +docker pull grafana/pyroscope +# Option B: Download binary from https://github.com/grafana/pyroscope/releases + +# FlameGraph scripts +cd ~ +git clone https://github.com/brendangregg/FlameGraph.git +``` + +### Configure perf Permissions + +```bash +# Allow perf for non-root users (needed for this workshop) +sudo sysctl -w kernel.perf_event_paranoid=1 + +# To make permanent: +echo 'kernel.perf_event_paranoid=1' | sudo tee -a /etc/sysctl.conf +``` + +### Verify Installation + +```bash +# Should all work without errors: +perf --version +strace --version +py-spy --version +gcc --version +python3 --version +``` + +--- + +## Directory Structure + +``` +perf-workshop/ +├── README.md # This file +├── common/ +│ └── CHEATSHEET.md # Quick reference card +├── scenario1-python-to-c/ +│ ├── README.md +│ ├── prime_slow.py # Slow Python version +│ ├── prime.c # C implementation +│ └── prime_fast.py # Python + C via ctypes +├── scenario2-memoization/ +│ ├── README.md +│ ├── fib_slow.py # Naive recursive Fibonacci +│ ├── fib_cached.py # Memoized Fibonacci +│ └── config_validator.py # Precomputation example +├── scenario3-syscall-storm/ +│ ├── README.md +│ ├── Makefile +│ ├── read_slow.c # Byte-by-byte reads +│ ├── read_fast.c # Buffered reads +│ ├── read_stdio.c # stdio buffering +│ └── read_python.py # Python equivalent +├── scenario4-cache-misses/ +│ ├── README.md +│ ├── Makefile +│ 
├── cache_demo.c # Row vs column major +│ └── list_vs_array.c # Array vs linked list +├── scenario5-debug-symbols/ +│ ├── README.md +│ ├── Makefile +│ └── program.c # Multi-function program +├── scenario6-usdt-probes/ +│ ├── README.md +│ ├── Makefile +│ └── server.c # Program with USDT probes +└── scenario7-pyroscope/ + ├── README.md + ├── requirements.txt + ├── app.py # Flask app with Pyroscope + └── loadgen.sh # Load generator script +``` + +--- + +## Quick Start + +### Build Everything + +```bash +# Build all C programs +for dir in scenario{3,4,5,6}*/; do + if [ -f "$dir/Makefile" ]; then + echo "Building $dir" + make -C "$dir" + fi +done + +# Build scenario 1 C library +cd scenario1-python-to-c +gcc -O2 -fPIC -shared -o libprime.so prime.c +cd .. +``` + +### Run a Scenario + +Each scenario has its own README with step-by-step instructions. +Start with: + +```bash +cd scenario1-python-to-c +cat README.md +``` + +--- + +## Key Concepts Summary + +### 1. Types of Bottlenecks + +| Type | Symptom | Tool | +|------|---------|------| +| CPU-bound | `user` time is high | `perf record` | +| Syscall-bound | `sys` time is high | `strace -c` | +| I/O-bound | Low CPU, slow wall time | `strace`, `iostat` | +| Memory-bound | High cache misses | `perf stat` | + +### 2. Profiling Workflow + +``` +1. Measure: time ./program +2. Hypothesize: Where is time spent? +3. Profile: perf/strace/cProfile +4. Analyze: Find hot spots +5. Optimize: Fix the bottleneck +6. Verify: Re-measure +``` + +### 3. Tool Selection + +| Task | Tool | +|------|------| +| Basic timing | `time` | +| CPU sampling | `perf record` | +| Hardware counters | `perf stat` | +| Syscall tracing | `strace -c` | +| Python profiling | `cProfile`, `py-spy` | +| Visualization | Flamegraphs | +| Continuous profiling | Pyroscope | + +--- + +## Further Learning + +### Books +- "Systems Performance" by Brendan Gregg +- "BPF Performance Tools" by Brendan Gregg + +### Online Resources +- https://www.brendangregg.com/linuxperf.html +- https://perf.wiki.kernel.org/ +- https://jvns.ca/blog/2016/03/12/how-does-perf-work-and-some-questions/ + +### Tools to Explore Later +- `bpftrace` - High-level tracing language +- `eBPF` - In-kernel programmability +- `Valgrind` - Memory profiling +- `gprof` - Traditional profiler + +--- + +## Troubleshooting + +### "perf: command not found" +```bash +sudo apt install linux-tools-common linux-tools-$(uname -r) +``` + +### "Access to performance monitoring operations is limited" +```bash +sudo sysctl -w kernel.perf_event_paranoid=1 +``` + +### "py-spy: Permission denied" +Either run as root or use `--nonblocking`: +```bash +sudo py-spy record -o profile.svg -- python3 script.py +# Or: +py-spy record --nonblocking -o profile.svg -- python3 script.py +``` + +### "No debug symbols" +Recompile with `-g`: +```bash +gcc -O2 -g -o program program.c +``` + +--- + +## Feedback + +Found an issue? Have suggestions? +Please provide feedback to your instructor! 
+ +--- + +*Workshop materials prepared for BITS Pilani Goa* +*Tools: All libre/open-source software* diff --git a/common/CHEATSHEET.md b/common/CHEATSHEET.md new file mode 100644 index 0000000..0b637c3 --- /dev/null +++ b/common/CHEATSHEET.md @@ -0,0 +1,256 @@ +# Linux Performance Tools Cheatsheet + +## Quick Reference Card for Workshop + +--- + +## time - Basic Timing + +```bash +time ./program # Wall clock, user, sys time +time -v ./program # Verbose (GNU time, may need /usr/bin/time -v) +``` + +**Output explained:** +- `real` - Wall clock time (what a stopwatch would show) +- `user` - CPU time in user space (your code) +- `sys` - CPU time in kernel space (syscalls) + +--- + +## perf stat - Hardware Counters + +```bash +# Basic stats +perf stat ./program + +# Specific events +perf stat -e cycles,instructions,cache-misses ./program + +# Repeat for statistical accuracy +perf stat -r 5 ./program + +# Common events +perf stat -e cycles,instructions,cache-references,cache-misses,branches,branch-misses ./program +``` + +**Key metrics:** +- **IPC** (Instructions Per Cycle): Higher is better, >1 is good +- **Cache miss ratio**: Lower is better +- **Branch misses**: Lower is better + +--- + +## perf record/report - CPU Sampling + +```bash +# Record samples +perf record ./program +perf record -g ./program # With call graphs +perf record -F 99 ./program # Custom frequency (99 Hz) + +# Analyze +perf report # Interactive TUI +perf report --stdio # Text output +perf report -n --stdio # With sample counts +``` + +**TUI navigation:** +- Arrow keys: Navigate +- Enter: Zoom into function +- `a`: Annotate (show source/assembly) +- `q`: Quit/back + +--- + +## perf annotate - Source-Level View + +```bash +# After perf record +perf annotate function_name # Show hot lines +perf annotate -s function_name # Source view (needs -g) +``` + +**Requires:** Compiled with `-g` for source view + +--- + +## strace - Syscall Tracing + +```bash +# Summary of syscalls +strace -c ./program + +# Trace specific syscalls +strace -e read,write ./program + +# With timing per call +strace -T ./program + +# Follow forks +strace -f ./program + +# Output to file +strace -o trace.log ./program +``` + +**Key columns in `-c` output:** +- `% time`: Percentage of total syscall time +- `calls`: Number of times called +- `errors`: Failed calls + +--- + +## Flamegraphs + +```bash +# Clone FlameGraph repo (one time) +git clone https://github.com/brendangregg/FlameGraph.git + +# Generate flamegraph +perf record -g ./program +perf script | ./FlameGraph/stackcollapse-perf.pl | ./FlameGraph/flamegraph.pl > profile.svg + +# Open in browser +firefox profile.svg +``` + +--- + +## Python Profiling + +```bash +# cProfile - built-in profiler +python3 -m cProfile script.py +python3 -m cProfile -s cumtime script.py # Sort by cumulative time +python3 -m cProfile -s ncalls script.py # Sort by call count +python3 -m cProfile -o profile.stats script.py # Save for later analysis + +# py-spy - sampling profiler (low overhead) +pip install py-spy +py-spy record -o profile.svg -- python3 script.py # Flamegraph +py-spy top -- python3 script.py # Live view + +# Attach to running process +py-spy top --pid 12345 +``` + +--- + +## /proc Filesystem + +```bash +# Process info +cat /proc//status # Process status +cat /proc//maps # Memory mappings +cat /proc//fd # Open file descriptors +cat /proc//smaps # Detailed memory info + +# System info +cat /proc/cpuinfo # CPU details +cat /proc/meminfo # Memory details +cat /proc/loadavg # Load average +``` + +--- + +## htop - 
System Overview + +```bash +htop # Interactive process viewer +``` + +**Key shortcuts:** +- `F6`: Sort by column +- `F5`: Tree view +- `F9`: Kill process +- `t`: Toggle tree view +- `H`: Toggle user threads + +--- + +## hyperfine - Benchmarking + +```bash +# Basic benchmark +hyperfine './program' + +# Compare two programs +hyperfine './program_v1' './program_v2' + +# With warmup +hyperfine --warmup 3 './program' + +# Export results +hyperfine --export-markdown results.md './program' +``` + +--- + +## Quick Diagnosis Flowchart + +``` +Program is slow + │ + ▼ +┌──────────────────┐ +│ time ./program │ +└────────┬─────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ Is 'sys' time high? │ + └─────────┬───────────┬───────────┘ + YES NO + │ │ + ▼ ▼ + ┌──────────┐ ┌──────────────┐ + │ strace -c │ │ perf record │ + │ (syscalls)│ │ (CPU profile)│ + └──────────┘ └──────────────┘ +``` + +--- + +## Permission Issues + +```bash +# Allow perf for non-root (temporary) +sudo sysctl -w kernel.perf_event_paranoid=1 + +# Allow perf for non-root (permanent) +echo 'kernel.perf_event_paranoid=1' | sudo tee -a /etc/sysctl.conf + +# Run perf as root if needed +sudo perf record ./program +``` + +--- + +## Useful One-Liners + +```bash +# Top 10 functions by CPU time +perf report -n --stdio | head -20 + +# Count syscalls by type +strace -c ./program 2>&1 | tail -20 + +# Watch cache misses in real-time +perf stat -e cache-misses -I 1000 -p + +# Find memory-mapped files for a process +cat /proc//maps | grep -v '\[' | awk '{print $6}' | sort -u + +# Monitor a Python process +py-spy top --pid $(pgrep -f 'python.*myapp') +``` + +--- + +## Resources + +- **Brendan Gregg's site**: https://www.brendangregg.com/linuxperf.html +- **perf wiki**: https://perf.wiki.kernel.org/ +- **FlameGraph repo**: https://github.com/brendangregg/FlameGraph +- **py-spy docs**: https://github.com/benfred/py-spy diff --git a/scenario1-python-to-c/README.md b/scenario1-python-to-c/README.md new file mode 100644 index 0000000..4f8dc14 --- /dev/null +++ b/scenario1-python-to-c/README.md @@ -0,0 +1,55 @@ +# Scenario 1: Python to C Optimization + +## Learning Objectives +- Use `time` to measure execution time +- Profile Python code with `cProfile` +- Generate flamegraphs with `py-spy` +- Use `ctypes` to call C code from Python + +## Files +- `prime_slow.py` - Pure Python implementation (slow) +- `prime.c` - C implementation of the hot function +- `prime_fast.py` - Python calling C via ctypes + +## Exercises + +### Step 1: Measure the baseline +```bash +time python3 prime_slow.py 100000 +``` + +Note the `real` time (wall clock) and `user` time (CPU time in user space). + +### Step 2: Profile with cProfile +```bash +python3 -m cProfile -s cumtime prime_slow.py 100000 +``` + +Look for: +- Which function has the highest `cumtime` (cumulative time)? +- How many times (`ncalls`) is `is_prime` called? + +### Step 3: Generate a flamegraph +```bash +py-spy record -o prime_slow.svg -- python3 prime_slow.py +``` + +Open `prime_slow.svg` in a browser. The widest bar at the top shows where time is spent. + +### Step 4: Compile and run the optimized version +```bash +# Compile the C library +gcc -O2 -fPIC -shared -o libprime.so prime.c + +# Run the fast version +time python3 prime_fast.py 100000 +``` + +### Step 5: Compare +- How much faster is the C version? +- Generate a flamegraph for `prime_fast.py` - what's different? + +## Discussion Questions +1. Why is the C version faster? (Hint: interpreter overhead, type checking) +2. 
When is it worth rewriting in C vs. finding a library? +3. What's the trade-off of using `ctypes` vs native Python? diff --git a/scenario1-python-to-c/prime.c b/scenario1-python-to-c/prime.c new file mode 100644 index 0000000..44f0233 --- /dev/null +++ b/scenario1-python-to-c/prime.c @@ -0,0 +1,36 @@ +/* + * Scenario 1: C implementation of is_prime + * ========================================= + * Compile as shared library: + * gcc -O2 -fPIC -shared -o libprime.so prime.c + * + * Or with debug symbols for profiling: + * gcc -O2 -g -fPIC -shared -o libprime.so prime.c + */ + +#include + +/* Check if n is prime using trial division */ +int is_prime(int64_t n) { + if (n < 2) return 0; + if (n == 2) return 1; + if (n % 2 == 0) return 0; + + int64_t i = 3; + while (i * i <= n) { + if (n % i == 0) return 0; + i += 2; + } + return 1; +} + +/* Count primes up to limit - can also be called from Python */ +int64_t count_primes(int64_t limit) { + int64_t count = 0; + for (int64_t n = 2; n <= limit; n++) { + if (is_prime(n)) { + count++; + } + } + return count; +} diff --git a/scenario1-python-to-c/prime_fast.py b/scenario1-python-to-c/prime_fast.py new file mode 100644 index 0000000..cccc9c9 --- /dev/null +++ b/scenario1-python-to-c/prime_fast.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +""" +Scenario 1: The Optimized Version +================================= +This version calls the C implementation via ctypes. + +SETUP: + gcc -O2 -fPIC -shared -o libprime.so prime.c + +EXERCISES: +1. Compare: time python3 prime_fast.py +2. Profile: py-spy record -o prime_fast.svg -- python3 prime_fast.py +3. Compare the flamegraphs - what changed? +""" + +import ctypes +import sys +import os + +# Load the shared library +script_dir = os.path.dirname(os.path.abspath(__file__)) +lib_path = os.path.join(script_dir, "libprime.so") + +try: + libprime = ctypes.CDLL(lib_path) +except OSError as e: + print(f"Error: Could not load {lib_path}") + print("Please compile first: gcc -O2 -fPIC -shared -o libprime.so prime.c") + sys.exit(1) + +# Define function signatures +libprime.is_prime.argtypes = [ctypes.c_int64] +libprime.is_prime.restype = ctypes.c_int + +libprime.count_primes.argtypes = [ctypes.c_int64] +libprime.count_primes.restype = ctypes.c_int64 + + +def is_prime(n): + """Python wrapper for C is_prime.""" + return bool(libprime.is_prime(n)) + + +def count_primes(limit): + """Python wrapper for C count_primes.""" + return libprime.count_primes(limit) + + +def main(): + limit = 1_000_000 + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + print(f"Counting primes up to {limit} (using C library)...") + result = count_primes(limit) + print(f"Found {result} primes") + + +if __name__ == "__main__": + main() diff --git a/scenario1-python-to-c/prime_slow.py b/scenario1-python-to-c/prime_slow.py new file mode 100644 index 0000000..14866ef --- /dev/null +++ b/scenario1-python-to-c/prime_slow.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Scenario 1: The Obvious CPU Hog +=============================== +This program counts prime numbers using trial division. +It's intentionally slow to demonstrate profiling. + +EXERCISES: +1. Run with: time python3 prime_slow.py +2. Profile with: python3 -m cProfile -s cumtime prime_slow.py +3. Generate flamegraph: py-spy record -o prime_slow.svg -- python3 prime_slow.py +4. 
Identify the hot function, then see prime_fast.py for the optimized version +""" + +import sys + + +def is_prime(n): + """Check if n is prime using trial division.""" + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + # Check odd divisors up to sqrt(n) + i = 3 + while i * i <= n: + if n % i == 0: + return False + i += 2 + return True + + +def count_primes(limit): + """Count all primes up to limit.""" + count = 0 + for n in range(2, limit + 1): + if is_prime(n): + count += 1 + return count + + +def main(): + limit = 1_000_000 # Adjust this to change runtime + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + print(f"Counting primes up to {limit}...") + result = count_primes(limit) + print(f"Found {result} primes") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/README.md b/scenario2-memoization/README.md new file mode 100644 index 0000000..2b2cfb5 --- /dev/null +++ b/scenario2-memoization/README.md @@ -0,0 +1,96 @@ +# Scenario 2: Memoization and Precomputation + +## Learning Objectives +- Read cProfile output to identify redundant function calls +- Use `@functools.lru_cache` for automatic memoization +- Recognize when precomputation beats memoization +- Understand space-time trade-offs + +## Files +- `fib_slow.py` - Naive recursive Fibonacci (exponential time) +- `fib_cached.py` - Memoized Fibonacci (linear time) +- `config_validator.py` - Comparison of naive, memoized, and precomputed approaches + +## Exercise 1: Fibonacci + +### Step 1: Experience the slowness +```bash +time python3 fib_slow.py 35 +``` + +This should take several seconds. Don't try n=50! + +### Step 2: Profile to understand why +```bash +python3 -m cProfile -s ncalls fib_slow.py 35 2>&1 | head -20 +``` + +Key insight: Look at `ncalls` for the `fib` function. For fib(35), it's called +millions of times because we recompute the same values repeatedly. + +The call tree looks like: +``` +fib(5) +├── fib(4) +│ ├── fib(3) +│ │ ├── fib(2) +│ │ └── fib(1) +│ └── fib(2) +└── fib(3) <-- Same as above! Redundant! + ├── fib(2) + └── fib(1) +``` + +### Step 3: Apply memoization +```bash +time python3 fib_cached.py 35 +``` + +Now try a much larger value: +```bash +time python3 fib_cached.py 100 +``` + +### Step 4: Verify the improvement +```bash +python3 -m cProfile -s ncalls fib_cached.py 35 2>&1 | head -20 +``` + +The `ncalls` should now be O(n) instead of O(2^n). + +## Exercise 2: Config Validator + +This example shows when precomputation is better than memoization. + +### Run all three strategies +```bash +python3 config_validator.py 5000 +``` + +### Profile to understand the differences +```bash +python3 -m cProfile -s cumtime config_validator.py 5000 +``` + +### Discussion Questions +1. Why is precomputation faster than memoization here? + - Hint: How many unique inputs are there? + - Hint: What's the overhead of cache lookup vs dict lookup? + +2. When would memoization be better than precomputation? + - Hint: What if there were 10,000 rules and 10,000 event types? + - Hint: What if we didn't know the inputs ahead of time? + +3. What's the memory trade-off? 
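+
+As a starting point for question 3, a short sketch like the one below can put rough numbers on the trade-off. It is an illustrative helper rather than part of the scenario files, and it assumes it is run from this directory so that `config_validator` is importable; note that `sys.getsizeof` measures only the dict container, not the cached keys and values.
+
+```python
+#!/usr/bin/env python3
+"""Rough memory-cost comparison: lru_cache vs precomputed table."""
+import sys
+
+import config_validator as cv
+
+events = cv.generate_test_events(5000)
+
+# Memoized path: lru_cache reports how many results it is holding.
+cv.validate_rule_cached.cache_clear()
+cv.process_events_memoized(events)
+print("lru_cache entries:", cv.validate_rule_cached.cache_info().currsize)
+
+# Precomputed path: the module-level table is rebuilt on first use.
+cv.VALIDATION_TABLE = None
+cv.process_events_precomputed(events)
+print("table entries:", len(cv.VALIDATION_TABLE))
+print("table container size:", sys.getsizeof(cv.VALIDATION_TABLE), "bytes")
+```
+
+Both caches end up holding the same 25 results in this scenario, so the comparison in question 1 comes down to lookup overhead rather than memory.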
+ +## Key Takeaways + +| Approach | When to Use | +|----------|-------------| +| No caching | Function is cheap OR called once per input | +| Memoization | Unknown/large input space, function is expensive | +| Precomputation | Known/small input space, amortize cost over many lookups | + +## Further Reading +- `functools.lru_cache` documentation +- `functools.cache` (Python 3.9+) - unbounded cache, simpler API diff --git a/scenario2-memoization/config_validator.py b/scenario2-memoization/config_validator.py new file mode 100644 index 0000000..784db3c --- /dev/null +++ b/scenario2-memoization/config_validator.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +""" +Scenario 2b: The Precomputation Insight +======================================= +This simulates a config validator that checks rules against events. +The "expensive" validation function is called repeatedly with the same inputs. + +This example shows three stages of optimization: +1. Naive: call the function every time +2. Memoized: cache results with @lru_cache +3. Precomputed: since inputs are known ahead of time, build a lookup table + +EXERCISES: +1. Run each version and compare times +2. Profile each version - observe ncalls and cumtime +3. Think about: when is precomputation better than memoization? +""" + +import sys +import time +from functools import lru_cache + + +# Simulated "expensive" validation function +def validate_rule_slow(rule_id, event_type): + """ + Simulate an expensive validation check. + In real life, this might query a database, parse XML, etc. + """ + # Artificial delay to simulate expensive computation + total = 0 + for i in range(10000): + total += (rule_id * event_type * i) % 997 + return total % 2 == 0 # Returns True or False + + +# The set of all valid (rule_id, event_type) pairs we'll encounter +RULES = [1, 2, 3, 4, 5] +EVENT_TYPES = [10, 20, 30, 40, 50] + + +def process_events_naive(events): + """Process events using naive repeated validation.""" + valid_count = 0 + for rule_id, event_type, data in events: + if validate_rule_slow(rule_id, event_type): + valid_count += 1 + return valid_count + + +# Memoized version +@lru_cache(maxsize=None) +def validate_rule_cached(rule_id, event_type): + """Same validation but with caching.""" + total = 0 + for i in range(10000): + total += (rule_id * event_type * i) % 997 + return total % 2 == 0 + + +def process_events_memoized(events): + """Process events using memoized validation.""" + valid_count = 0 + for rule_id, event_type, data in events: + if validate_rule_cached(rule_id, event_type): + valid_count += 1 + return valid_count + + +# Precomputed version +def build_validation_table(): + """ + Build a lookup table for all possible (rule_id, event_type) combinations. + This is O(n*m) upfront but O(1) per lookup thereafter. 
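+    With only len(RULES) * len(EVENT_TYPES) = 25 combinations here, the
+    table is tiny and the upfront cost is paid exactly once.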
+ """ + table = {} + for rule_id in RULES: + for event_type in EVENT_TYPES: + table[(rule_id, event_type)] = validate_rule_slow(rule_id, event_type) + return table + + +VALIDATION_TABLE = None # Lazy initialization + + +def process_events_precomputed(events): + """Process events using precomputed lookup table.""" + global VALIDATION_TABLE + if VALIDATION_TABLE is None: + VALIDATION_TABLE = build_validation_table() + + valid_count = 0 + for rule_id, event_type, data in events: + if VALIDATION_TABLE[(rule_id, event_type)]: + valid_count += 1 + return valid_count + + +def generate_test_events(n): + """Generate n random test events.""" + import random + random.seed(42) # Reproducible + events = [] + for i in range(n): + rule_id = random.choice(RULES) + event_type = random.choice(EVENT_TYPES) + data = f"event_{i}" + events.append((rule_id, event_type, data)) + return events + + +def benchmark(name, func, events): + """Run a function and report timing.""" + start = time.perf_counter() + result = func(events) + elapsed = time.perf_counter() - start + print(f"{name:20s}: {elapsed:.3f}s (valid: {result})") + return elapsed + + +def main(): + n_events = 5000 + if len(sys.argv) > 1: + n_events = int(sys.argv[1]) + + print(f"Processing {n_events} events...") + print(f"Unique (rule, event_type) combinations: {len(RULES) * len(EVENT_TYPES)}") + print() + + events = generate_test_events(n_events) + + # Reset cached function for fair comparison + validate_rule_cached.cache_clear() + global VALIDATION_TABLE + VALIDATION_TABLE = None + + t_naive = benchmark("Naive", process_events_naive, events) + + validate_rule_cached.cache_clear() + t_memo = benchmark("Memoized", process_events_memoized, events) + + VALIDATION_TABLE = None + t_pre = benchmark("Precomputed", process_events_precomputed, events) + + print() + print(f"Speedup (memo vs naive): {t_naive/t_memo:.1f}x") + print(f"Speedup (precomp vs naive): {t_naive/t_pre:.1f}x") + print(f"Speedup (precomp vs memo): {t_memo/t_pre:.1f}x") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/fib_cached.py b/scenario2-memoization/fib_cached.py new file mode 100644 index 0000000..e8c2c65 --- /dev/null +++ b/scenario2-memoization/fib_cached.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +""" +Scenario 2: Memoization with functools.lru_cache +================================================ +Adding @lru_cache transforms O(2^n) into O(n) by caching results. + +EXERCISES: +1. Run: time python3 fib_cached.py 35 +2. Compare to fib_slow.py - how much faster? +3. Profile: python3 -m cProfile -s ncalls fib_cached.py 35 +4. Notice the dramatic reduction in call count +5. 
Try a much larger number: python3 fib_cached.py 100 +""" + +import sys +from functools import lru_cache + + +@lru_cache(maxsize=None) # Unlimited cache size +def fib(n): + """Compute the nth Fibonacci number with memoization.""" + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) + + +def main(): + n = 35 + if len(sys.argv) > 1: + n = int(sys.argv[1]) + + print(f"Computing fib({n}) with memoization...") + result = fib(n) + print(f"fib({n}) = {result}") + + # Show cache statistics + print(f"\nCache info: {fib.cache_info()}") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/fib_slow.py b/scenario2-memoization/fib_slow.py new file mode 100644 index 0000000..0b192d3 --- /dev/null +++ b/scenario2-memoization/fib_slow.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Scenario 2: Hidden Redundancy - The Memoization Problem +======================================================== +This program computes Fibonacci numbers recursively. +The naive implementation has exponential time complexity due to redundant calls. + +EXERCISES: +1. Run: time python3 fib_slow.py 35 +2. Profile: python3 -m cProfile -s ncalls fib_slow.py 35 +3. Notice the HUGE number of calls to fib() +4. See fib_cached.py for the memoized version +""" + +import sys + + +def fib(n): + """Compute the nth Fibonacci number recursively.""" + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) + + +def main(): + n = 35 # Don't go much higher without optimization! + if len(sys.argv) > 1: + n = int(sys.argv[1]) + + print(f"Computing fib({n})...") + result = fib(n) + print(f"fib({n}) = {result}") + + +if __name__ == "__main__": + main() diff --git a/scenario3-syscall-storm/Makefile b/scenario3-syscall-storm/Makefile new file mode 100644 index 0000000..8f707bd --- /dev/null +++ b/scenario3-syscall-storm/Makefile @@ -0,0 +1,24 @@ +CC = gcc +CFLAGS = -O2 -Wall + +all: read_slow read_fast read_stdio testfile + +read_slow: read_slow.c + $(CC) $(CFLAGS) -o $@ $< + +read_fast: read_fast.c + $(CC) $(CFLAGS) -o $@ $< + +read_stdio: read_stdio.c + $(CC) $(CFLAGS) -o $@ $< + +testfile: + dd if=/dev/urandom of=testfile bs=1M count=1 + +smallfile: + dd if=/dev/urandom of=smallfile bs=10K count=1 + +clean: + rm -f read_slow read_fast read_stdio testfile smallfile + +.PHONY: all clean testfile smallfile diff --git a/scenario3-syscall-storm/README.md b/scenario3-syscall-storm/README.md new file mode 100644 index 0000000..c71f855 --- /dev/null +++ b/scenario3-syscall-storm/README.md @@ -0,0 +1,141 @@ +# Scenario 3: The Syscall Storm + +## Learning Objectives +- Understand the cost of syscalls (user-space to kernel-space transitions) +- Use `strace -c` to count and profile syscalls +- Learn why buffering matters for I/O performance +- Understand `time` output: `real`, `user`, `sys` + +## Files +- `read_slow.c` - Reads file byte-by-byte with raw `read()` syscalls +- `read_fast.c` - Reads file in 64KB chunks +- `read_stdio.c` - Uses stdio's `fgetc()` (internally buffered) +- `read_python.py` - Python equivalents + +## Setup + +```bash +# Compile all C programs and create test file +make all +``` + +## Exercise 1: Observe the Problem + +### Step 1: Time the slow version +```bash +time ./read_slow testfile +``` + +Notice that `sys` time dominates - the CPU is mostly waiting for syscalls! + +### Step 2: Count the syscalls +```bash +strace -c ./read_slow testfile +``` + +Look at: +- `calls` column for `read`: Should be ~1,000,000 (one per byte!) 
+- `% time` column: Most time in `read` + +### Step 3: Compare with fast version +```bash +time ./read_fast testfile +strace -c ./read_fast testfile +``` + +The `read` call count drops from ~1,000,000 to ~16 (1MB / 64KB). + +## Exercise 2: Understanding the Time Output + +```bash +time ./read_slow testfile +``` + +Output explanation: +- `real` - Wall clock time (what you'd measure with a stopwatch) +- `user` - CPU time in user space (your code running) +- `sys` - CPU time in kernel space (syscalls, I/O) + +For `read_slow`: `sys` >> `user` because we spend most time in the kernel. +For `read_fast`: `user` > `sys` because we spend more time processing data. + +## Exercise 3: strace Deep Dive + +### See individual syscalls (first 50) +```bash +strace -T ./read_slow testfile 2>&1 | head -50 +``` + +The `-T` flag shows time spent in each syscall. + +### Filter to just read() calls +```bash +strace -e read -c ./read_slow testfile +``` + +## Exercise 4: stdio Comparison + +```bash +time ./read_stdio testfile +strace -c ./read_stdio testfile +``` + +Questions: +- How many `read` syscalls does stdio make? +- Why is it still slower than `read_fast`? +- Hint: Check the default stdio buffer size (usually 4KB or 8KB) + +## Exercise 5: Python I/O + +```bash +# Create smaller file for unbuffered test +make smallfile + +# Run Python comparison +python3 read_python.py smallfile + +# Profile with strace +strace -c python3 read_python.py smallfile +``` + +## Key Insights + +### Why are syscalls expensive? +1. **Context switch**: CPU saves user state, loads kernel state +2. **Security checks**: Kernel validates permissions +3. **Memory barriers**: Caches may need flushing +4. **Scheduling**: Kernel may switch to another process + +### The buffering solution +Instead of: +``` +read(1 byte) → kernel → read(1 byte) → kernel → ... (million times) +``` + +We do: +``` +read(64KB) → kernel → process 64KB in user space → read(64KB) → ... +``` + +### Rule of thumb +- Syscall overhead: ~100-1000 nanoseconds +- Reading 64KB: ~10-100 microseconds (from cache/RAM) +- Break-even: buffer should be at least a few KB + +## perf stat Comparison + +```bash +perf stat ./read_slow testfile +perf stat ./read_fast testfile +``` + +Look at: +- `context-switches` +- `cpu-migrations` +- `instructions per cycle` + +## Further Exploration + +1. What happens with `read(fd, buf, 4096)` vs `read(fd, buf, 65536)`? +2. How does `mmap()` compare? (Memory-mapped I/O) +3. What about `O_DIRECT` flag? (Bypass page cache) diff --git a/scenario3-syscall-storm/read_fast.c b/scenario3-syscall-storm/read_fast.c new file mode 100644 index 0000000..dbdf668 --- /dev/null +++ b/scenario3-syscall-storm/read_fast.c @@ -0,0 +1,52 @@ +/* + * Scenario 3: Buffered I/O - The Fix + * =================================== + * This program reads a file in large chunks, dramatically reducing syscalls. + * + * Compile: gcc -O2 -o read_fast read_fast.c + * + * EXERCISES: + * 1. Run: time ./read_fast testfile + * 2. Compare: strace -c ./read_fast testfile + * 3. Notice the ~1000x reduction in syscalls + */ + +#include +#include +#include +#include + +#define BUFFER_SIZE 65536 /* 64KB buffer */ + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + char buffer[BUFFER_SIZE]; + unsigned long byte_count = 0; + unsigned long checksum = 0; + ssize_t bytes_read; + + /* Read in large chunks - much better! 
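+     * Each call now returns up to BUFFER_SIZE (64KB) bytes, so the 1MB
+     * test file needs roughly 16 read() syscalls instead of ~1,048,576.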
*/ + while ((bytes_read = read(fd, buffer, BUFFER_SIZE)) > 0) { + for (ssize_t i = 0; i < bytes_read; i++) { + checksum += (unsigned char)buffer[i]; + } + byte_count += bytes_read; + } + + close(fd); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/read_python.py b/scenario3-syscall-storm/read_python.py new file mode 100644 index 0000000..7781030 --- /dev/null +++ b/scenario3-syscall-storm/read_python.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Scenario 3: Syscall Storm in Python +==================================== +This demonstrates buffered vs unbuffered I/O in Python. + +EXERCISES: +1. Create test file: dd if=/dev/urandom of=testfile bs=1M count=1 +2. Run: python3 read_python.py testfile +3. Profile: strace -c python3 read_python.py testfile +""" + +import sys +import os +import time + + +def read_unbuffered(filename): + """Read file byte-by-byte using os.read (raw syscalls).""" + fd = os.open(filename, os.O_RDONLY) + byte_count = 0 + checksum = 0 + + while True: + data = os.read(fd, 1) # Read 1 byte at a time! + if not data: + break + byte_count += 1 + checksum += data[0] + + os.close(fd) + return byte_count, checksum + + +def read_buffered(filename): + """Read file using Python's buffered I/O.""" + byte_count = 0 + checksum = 0 + + with open(filename, 'rb') as f: + while True: + # Python's file object is buffered internally + data = f.read(65536) # Read 64KB chunks + if not data: + break + byte_count += len(data) + checksum += sum(data) + + return byte_count, checksum + + +def read_byte_by_byte_buffered(filename): + """ + Read byte-by-byte but through Python's buffered file object. + This is slow in Python but not due to syscalls! + """ + byte_count = 0 + checksum = 0 + + with open(filename, 'rb') as f: + while True: + data = f.read(1) # Looks like 1 byte, but file is buffered + if not data: + break + byte_count += 1 + checksum += data[0] + + return byte_count, checksum + + +def benchmark(name, func, filename): + """Run and time a function.""" + start = time.perf_counter() + byte_count, checksum = func(filename) + elapsed = time.perf_counter() - start + print(f"{name:30s}: {elapsed:.3f}s ({byte_count} bytes, checksum={checksum})") + return elapsed + + +def main(): + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} ") + print("\nCreate a test file with:") + print(" dd if=/dev/urandom of=testfile bs=1M count=1") + sys.exit(1) + + filename = sys.argv[1] + + print("Reading file with different strategies...\n") + + t_buf = benchmark("Buffered (64KB chunks)", read_buffered, filename) + t_byte_buf = benchmark("Byte-by-byte (buffered file)", read_byte_by_byte_buffered, filename) + + # Only run unbuffered test if file is small (< 100KB) + file_size = os.path.getsize(filename) + if file_size < 100_000: + t_unbuf = benchmark("Unbuffered (raw syscalls)", read_unbuffered, filename) + print(f"\nSpeedup (buffered vs unbuffered): {t_unbuf/t_buf:.1f}x") + else: + print(f"\nSkipping unbuffered test (file too large: {file_size} bytes)") + print("For unbuffered test, create smaller file: dd if=/dev/urandom of=smallfile bs=10K count=1") + print("Then run: strace -c python3 read_python.py smallfile") + + +if __name__ == "__main__": + main() diff --git a/scenario3-syscall-storm/read_slow.c b/scenario3-syscall-storm/read_slow.c new file mode 100644 index 0000000..226a2b1 --- /dev/null +++ b/scenario3-syscall-storm/read_slow.c @@ -0,0 +1,49 @@ +/* + * Scenario 3: The Syscall Storm - Unbuffered I/O + * 
=============================================== + * This program reads a file byte-by-byte using raw read() syscalls. + * Each byte triggers a context switch to the kernel - extremely slow! + * + * Compile: gcc -O2 -o read_slow read_slow.c + * + * EXERCISES: + * 1. Create a test file: dd if=/dev/urandom of=testfile bs=1M count=1 + * 2. Run: time ./read_slow testfile + * 3. Profile: strace -c ./read_slow testfile + * 4. Compare with read_fast.c + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + char c; + unsigned long byte_count = 0; + unsigned long checksum = 0; + + /* Read one byte at a time - TERRIBLE for performance! */ + while (read(fd, &c, 1) == 1) { + byte_count++; + checksum += (unsigned char)c; + } + + close(fd); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/read_stdio.c b/scenario3-syscall-storm/read_stdio.c new file mode 100644 index 0000000..da22694 --- /dev/null +++ b/scenario3-syscall-storm/read_stdio.c @@ -0,0 +1,47 @@ +/* + * Scenario 3: stdio Buffering + * =========================== + * This version uses fgetc() which is buffered by stdio. + * It reads byte-by-byte in C code, but stdio buffers internally. + * + * Compile: gcc -O2 -o read_stdio read_stdio.c + * + * EXERCISES: + * 1. Run: time ./read_stdio testfile + * 2. Compare: strace -c ./read_stdio testfile + * 3. Notice fewer syscalls than read_slow, but more than read_fast + * 4. Why? Default stdio buffer is ~4KB, we use 64KB in read_fast + */ + +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + FILE *fp = fopen(argv[1], "rb"); + if (!fp) { + perror("fopen"); + return 1; + } + + int c; + unsigned long byte_count = 0; + unsigned long checksum = 0; + + /* fgetc is buffered internally by stdio */ + while ((c = fgetc(fp)) != EOF) { + byte_count++; + checksum += (unsigned char)c; + } + + fclose(fp); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/testfile b/scenario3-syscall-storm/testfile new file mode 100644 index 0000000..5745cc8 Binary files /dev/null and b/scenario3-syscall-storm/testfile differ diff --git a/scenario4-cache-misses/Makefile b/scenario4-cache-misses/Makefile new file mode 100644 index 0000000..37dd626 --- /dev/null +++ b/scenario4-cache-misses/Makefile @@ -0,0 +1,15 @@ +CC = gcc +CFLAGS = -O2 -Wall + +all: cache_demo list_vs_array + +cache_demo: cache_demo.c + $(CC) $(CFLAGS) -o $@ $< + +list_vs_array: list_vs_array.c + $(CC) $(CFLAGS) -o $@ $< + +clean: + rm -f cache_demo list_vs_array + +.PHONY: all clean diff --git a/scenario4-cache-misses/README.md b/scenario4-cache-misses/README.md new file mode 100644 index 0000000..d5393cb --- /dev/null +++ b/scenario4-cache-misses/README.md @@ -0,0 +1,148 @@ +# Scenario 4: Cache Misses and Memory Access Patterns + +## Learning Objectives +- Understand CPU cache basics (L1, L2, L3) +- Use `perf stat` to measure cache behavior +- Recognize cache-friendly vs cache-hostile access patterns +- Understand why Big-O notation doesn't tell the whole story + +## Background: How CPU Caches Work + +``` +CPU Core + ↓ +L1 Cache (~32KB, ~4 cycles) + ↓ +L2 Cache (~256KB, ~12 cycles) + ↓ +L3 Cache (~8MB, ~40 
cycles) + ↓ +Main RAM (~64GB, ~200 cycles) +``` + +Key concepts: +- **Cache line**: Data is loaded in chunks (typically 64 bytes) +- **Spatial locality**: If you access byte N, bytes N+1, N+2, ... are likely already cached +- **Temporal locality**: Recently accessed data is likely to be accessed again + +## Files +- `cache_demo.c` - Row-major vs column-major 2D array traversal +- `list_vs_array.c` - Array vs linked list traversal + +## Exercise 1: Row vs Column Major + +### Step 1: Build and run +```bash +make cache_demo +./cache_demo +``` + +You should see column-major is significantly slower (often 3-10x). + +### Step 2: Measure cache misses +```bash +perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo +``` + +Compare the cache miss counts and ratios. + +### Why does this happen? + +C stores 2D arrays in **row-major** order: +``` +Memory: [0][0] [0][1] [0][2] ... [0][COLS-1] [1][0] [1][1] ... + ←————— row 0 ——————→ ←—— row 1 ——→ +``` + +**Row-major access**: Sequential in memory → cache lines are fully utilized +``` +Access: [0][0] [0][1] [0][2] [0][3] ... +Cache: [████████████████] ← one cache line serves 16 ints +``` + +**Column-major access**: Jumping by COLS * sizeof(int) bytes each time +``` +Access: [0][0] [1][0] [2][0] [3][0] ... +Cache: [█_______________] ← load entire line, use 1 int, evict + [█_______________] ← repeat for each access +``` + +## Exercise 2: Array vs Linked List + +### Step 1: Build and run +```bash +make list_vs_array +./list_vs_array +``` + +### Step 2: Measure cache behavior +```bash +perf stat -e cache-misses,cache-references ./list_vs_array +``` + +### Three cases compared: + +| Case | Memory Layout | Cache Behavior | +|------|---------------|----------------| +| Array | Contiguous | Excellent - prefetcher wins | +| List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent | +| List (scattered) | Random | Terrible - every access misses | + +### Why "sequential list" is still slower than array: + +1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address) +2. **Larger elements**: `struct node` is bigger than `int` (includes pointer) +3. **Indirect access**: Extra memory load for the `next` pointer + +## Exercise 3: Deeper perf Analysis + +### See more cache events +```bash +perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo +``` + +Events explained: +- `L1-dcache-*`: Level 1 data cache (fastest, smallest) +- `LLC-*`: Last Level Cache (L3, slowest cache before RAM) +- `cycles`: Total CPU cycles +- `instructions`: Total instructions executed +- IPC (instructions per cycle): Higher is better + +### Profile with perf record +```bash +perf record -e cache-misses ./cache_demo +perf report +``` + +This shows which functions cause the most cache misses. + +## Discussion Questions + +1. **Why doesn't the compiler fix this?** + - Compilers can sometimes interchange loops, but: + - Side effects may prevent it + - Aliasing makes it unsafe to assume + - The programmer often knows better + +2. **How big does the array need to be to see this effect?** + - If array fits in L1 cache: No difference + - If array fits in L3 cache: Moderate difference + - If array exceeds L3 cache: Dramatic difference + +3. 
**What about multithreaded code?** + - False sharing: Different threads accessing same cache line + - Cache coherency traffic between cores + +## Real-World Implications + +- **Image processing**: Process row-by-row, not column-by-column +- **Matrix operations**: Libraries like BLAS use cache-blocking +- **Data structures**: Arrays often beat linked lists in practice +- **Database design**: Row stores vs column stores + +## Key Takeaways + +1. **Memory access pattern matters as much as algorithm complexity** +2. **Sequential access is almost always faster than random access** +3. **Measure with `perf stat` before optimizing** +4. **Big-O notation hides constant factors that can be 10-100x** diff --git a/scenario4-cache-misses/cache_demo.c b/scenario4-cache-misses/cache_demo.c new file mode 100644 index 0000000..f1887c1 --- /dev/null +++ b/scenario4-cache-misses/cache_demo.c @@ -0,0 +1,109 @@ +/* + * Scenario 4: Cache Misses - Memory Access Patterns + * ================================================== + * This program demonstrates the performance impact of memory access patterns. + * Row-major vs column-major traversal of a 2D array. + * + * Compile: gcc -O2 -o cache_demo cache_demo.c + * + * EXERCISES: + * 1. Run: ./cache_demo + * 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo + * 3. Why is one so much faster? + */ + +#include +#include +#include +#include + +#define ROWS 8192 +#define COLS 8192 + +/* + * Global array to ensure it's not optimized away. + * This is a 64MB array (8192 * 8192 * sizeof(int) = 256MB if int is 4 bytes) + * Wait, that's too big. Let's use smaller dimensions or chars. + */ + +/* Using static to avoid stack overflow */ +static int matrix[ROWS][COLS]; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +long sum_row_major(void) { + /* + * Row-major traversal: access sequential memory addresses + * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ... + * This matches how C stores 2D arrays - CACHE FRIENDLY + */ + long sum = 0; + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + sum += matrix[i][j]; + } + } + return sum; +} + +long sum_col_major(void) { + /* + * Column-major traversal: jump around in memory + * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ... 
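+     * Consecutive iterations land in a different row each time, so a
+     * loaded cache line is barely reused before it is evicted: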
+ * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE + */ + long sum = 0; + for (int j = 0; j < COLS; j++) { + for (int i = 0; i < ROWS; i++) { + sum += matrix[i][j]; + } + } + return sum; +} + +void init_matrix(void) { + /* Initialize with some values */ + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + matrix[i][j] = (i + j) % 100; + } + } +} + +int main(void) { + printf("Matrix size: %d x %d = %zu bytes\n", + ROWS, COLS, sizeof(matrix)); + printf("Cache line size (typical): 64 bytes\n"); + printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int)); + + init_matrix(); + + double start, elapsed; + long result; + + /* Warm up */ + result = sum_row_major(); + result = sum_col_major(); + + /* Row-major benchmark */ + start = get_time(); + result = sum_row_major(); + elapsed = get_time() - start; + printf("Row-major sum: %ld in %.3f seconds\n", result, elapsed); + + /* Column-major benchmark */ + start = get_time(); + result = sum_col_major(); + elapsed = get_time() - start; + printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed); + + printf("\n"); + printf("To see cache misses, run:\n"); + printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n"); + + return 0; +} diff --git a/scenario4-cache-misses/list_vs_array.c b/scenario4-cache-misses/list_vs_array.c new file mode 100644 index 0000000..4c370e7 --- /dev/null +++ b/scenario4-cache-misses/list_vs_array.c @@ -0,0 +1,175 @@ +/* + * Scenario 4b: Array vs Linked List Traversal + * ============================================ + * Arrays have excellent cache locality; linked lists do not. + * This demonstrates why "O(n) vs O(n)" can have very different constants. + * + * Compile: gcc -O2 -o list_vs_array list_vs_array.c + */ + +#include +#include +#include + +#define N 10000000 /* 10 million elements */ + +struct node { + int value; + struct node *next; +}; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +/* Sum array elements */ +long sum_array(int *arr, int n) { + long sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum; +} + +/* Sum linked list elements */ +long sum_list(struct node *head) { + long sum = 0; + struct node *curr = head; + while (curr != NULL) { + sum += curr->value; + curr = curr->next; + } + return sum; +} + +/* Create array */ +int *create_array(int n) { + int *arr = malloc(n * sizeof(int)); + if (!arr) { + perror("malloc array"); + exit(1); + } + for (int i = 0; i < n; i++) { + arr[i] = i % 100; + } + return arr; +} + +/* Create linked list - nodes allocated sequentially (best case for list) */ +struct node *create_list_sequential(int n) { + struct node *nodes = malloc(n * sizeof(struct node)); + if (!nodes) { + perror("malloc list"); + exit(1); + } + + for (int i = 0; i < n - 1; i++) { + nodes[i].value = i % 100; + nodes[i].next = &nodes[i + 1]; + } + nodes[n - 1].value = (n - 1) % 100; + nodes[n - 1].next = NULL; + + return nodes; +} + +/* Create linked list - nodes allocated randomly (worst case for cache) */ +struct node *create_list_scattered(int n) { + /* Allocate nodes individually to scatter them in memory */ + struct node **nodes = malloc(n * sizeof(struct node *)); + if (!nodes) { + perror("malloc"); + exit(1); + } + + /* Allocate each node separately */ + for (int i = 0; i < n; i++) { + nodes[i] = malloc(sizeof(struct node)); + if (!nodes[i]) { + perror("malloc node"); + exit(1); + } + nodes[i]->value = i % 100; + } + + 
/* Shuffle the order (Fisher-Yates) */ + srand(42); + for (int i = n - 1; i > 0; i--) { + int j = rand() % (i + 1); + struct node *tmp = nodes[i]; + nodes[i] = nodes[j]; + nodes[j] = tmp; + } + + /* Link them in shuffled order */ + for (int i = 0; i < n - 1; i++) { + nodes[i]->next = nodes[i + 1]; + } + nodes[n - 1]->next = NULL; + + struct node *head = nodes[0]; + free(nodes); /* Free the pointer array, not the nodes */ + return head; +} + +void free_scattered_list(struct node *head) { + while (head != NULL) { + struct node *next = head->next; + free(head); + head = next; + } +} + +int main(void) { + printf("Comparing array vs linked list traversal (%d elements)\n\n", N); + + double start, elapsed; + long result; + + /* Array */ + printf("Creating array...\n"); + int *arr = create_array(N); + + start = get_time(); + result = sum_array(arr, N); + elapsed = get_time() - start; + printf("Array sum: %ld in %.4f seconds\n", result, elapsed); + double array_time = elapsed; + free(arr); + + /* Sequential linked list (best case for list) */ + printf("\nCreating sequential linked list...\n"); + struct node *list_seq = create_list_sequential(N); + + start = get_time(); + result = sum_list(list_seq); + elapsed = get_time() - start; + printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed); + double list_seq_time = elapsed; + free(list_seq); + + /* Scattered linked list (worst case for cache) */ + printf("\nCreating scattered linked list (this takes a while)...\n"); + struct node *list_scat = create_list_scattered(N); + + start = get_time(); + result = sum_list(list_scat); + elapsed = get_time() - start; + printf("List sum (scattered): %ld in %.4f seconds\n", result, elapsed); + double list_scat_time = elapsed; + free_scattered_list(list_scat); + + printf("\n--- Summary ---\n"); + printf("Array: %.4fs (baseline)\n", array_time); + printf("List (sequential): %.4fs (%.1fx slower)\n", + list_seq_time, list_seq_time / array_time); + printf("List (scattered): %.4fs (%.1fx slower)\n", + list_scat_time, list_scat_time / array_time); + + printf("\nTo see cache behavior:\n"); + printf(" perf stat -e cache-misses,cache-references ./list_vs_array\n"); + + return 0; +} diff --git a/scenario5-debug-symbols/Makefile b/scenario5-debug-symbols/Makefile new file mode 100644 index 0000000..75dbc28 --- /dev/null +++ b/scenario5-debug-symbols/Makefile @@ -0,0 +1,40 @@ +CC = gcc +CFLAGS_BASE = -O2 -Wall + +all: nodebug withdebug + +nodebug: program.c + $(CC) $(CFLAGS_BASE) -o $@ $< -lm + +withdebug: program.c + $(CC) $(CFLAGS_BASE) -g -o $@ $< -lm + +# Show the size difference +sizes: nodebug withdebug + @echo "File sizes:" + @ls -lh nodebug withdebug + @echo "" + @echo "Section sizes (nodebug):" + @size nodebug + @echo "" + @echo "Section sizes (withdebug):" + @size withdebug + +# Show symbols +symbols: nodebug withdebug + @echo "Symbols in nodebug:" + @nm nodebug | grep -E "compute|process|main" || true + @echo "" + @echo "Symbols in withdebug:" + @nm withdebug | grep -E "compute|process|main" || true + +# Strip the debug version to show what happens +stripped: withdebug + cp withdebug stripped + strip stripped + @echo "Stripped file created" + +clean: + rm -f nodebug withdebug stripped perf.data perf.data.old + +.PHONY: all sizes symbols clean stripped diff --git a/scenario5-debug-symbols/README.md b/scenario5-debug-symbols/README.md new file mode 100644 index 0000000..fbb0659 --- /dev/null +++ b/scenario5-debug-symbols/README.md @@ -0,0 +1,179 @@ +# Scenario 5: Debug Symbols - The Missing Map + +## 
Learning Objectives +- Understand what debug symbols are and why they matter +- Compare profiling output with and without symbols +- Use `perf annotate` to see source-level hotspots +- Understand the trade-offs of shipping debug symbols + +## Background + +When you compile C code, the compiler translates your source into machine code. +By default, the connection between source lines and machine instructions is lost. + +**Debug symbols** (enabled with `-g`) preserve this mapping: +- Function names +- Source file names and line numbers +- Variable names and types +- Inline function information + +## Files +- `program.c` - A program with nested function calls +- `Makefile` - Builds `nodebug` and `withdebug` versions + +## Exercise 1: Build Both Versions + +```bash +make all +``` + +Compare file sizes: +```bash +make sizes +``` + +The `withdebug` binary is larger due to DWARF debug sections. + +## Exercise 2: Profile Without Debug Symbols + +```bash +perf record ./nodebug 500 5000 +perf report +``` + +You'll see something like: +``` + 45.23% nodebug nodebug [.] 0x0000000000001234 + 32.11% nodebug nodebug [.] 0x0000000000001456 + 12.45% nodebug libm.so [.] __sin_fma +``` + +The hex addresses (`0x...`) tell you nothing useful! + +## Exercise 3: Profile With Debug Symbols + +```bash +perf record ./withdebug 500 5000 +perf report +``` + +Now you see: +``` + 45.23% withdebug withdebug [.] compute_inner + 32.11% withdebug withdebug [.] compute_middle + 12.45% withdebug libm.so [.] __sin_fma +``` + +Much better! You can see which functions are hot. + +## Exercise 4: Source-Level Annotation + +With debug symbols, you can see exactly which lines are hot: + +```bash +perf record ./withdebug 500 5000 +perf annotate compute_inner +``` + +This shows the source code with cycle counts per line! + +``` + │ double compute_inner(double x, int iterations) { + │ double result = x; + 1.23 │ for (int i = 0; i < iterations; i++) { + 45.67 │ result = sin(result) * cos(result) + sqrt(fabs(result)); + 23.45 │ result = result * 1.0001 + 0.0001; + │ } + │ return result; + │ } +``` + +## Exercise 5: Understanding Symbol Tables + +```bash +# Show symbols in each binary +make symbols + +# Or manually: +nm nodebug | head -20 +nm withdebug | head -20 + +# Show more detail about sections +readelf -S withdebug | grep debug +``` + +Debug sections you might see: +- `.debug_info` - Type information +- `.debug_line` - Line number tables +- `.debug_str` - String table +- `.debug_abbrev` - Abbreviation tables + +## Exercise 6: Stripping Symbols + +Production binaries are often "stripped" to reduce size: + +```bash +make stripped +ls -l withdebug stripped + +# Try to profile the stripped binary +perf record ./stripped 500 5000 +perf report +``` + +The stripped binary loses symbol information too! + +## Exercise 7: Separate Debug Files + +In production, you can ship stripped binaries but keep debug info separate: + +```bash +# Extract debug info to separate file +objcopy --only-keep-debug withdebug withdebug.debug + +# Strip the binary +strip withdebug -o withdebug.stripped + +# Add a link to the debug file +objcopy --add-gnu-debuglink=withdebug.debug withdebug.stripped + +# Now perf can find the debug info +perf record ./withdebug.stripped 500 5000 +perf report +``` + +This is how Linux distros provide `-dbg` or `-debuginfo` packages. + +## Discussion Questions + +1. 
**Why don't we always compile with `-g`?** + - Binary size (can be 2-10x larger) + - Exposes source structure (security/IP concerns) + - Some optimizations may be affected (though `-O2 -g` works well) + +2. **Does `-g` affect performance?** + - Generally no: debug info is stored in separate sections + - Not loaded unless a debugger attaches + - Some edge cases with frame pointers + +3. **What about release vs debug builds?** + - Debug build: `-O0 -g` (no optimization, full debug) + - Release build: `-O2 -g` (optimized, with symbols) + - Stripped release: `-O2` then `strip` + +## Key Takeaways + +1. **Always compile with `-g` during development** +2. **Debug symbols don't meaningfully affect runtime performance** +3. **Without symbols, profilers show useless hex addresses** +4. **Production: ship stripped binaries, keep debug files for crash analysis** + +## Bonus: Flamegraph Generation + +```bash +# Record with call graph +perf record -g ./withdebug 500 5000 + +# Generate flamegraph (requires FlameGraph scripts) +perf script | /path/to/FlameGraph/stackcollapse-perf.pl | /path/to/FlameGraph/flamegraph.pl > profile.svg +``` diff --git a/scenario5-debug-symbols/program.c b/scenario5-debug-symbols/program.c new file mode 100644 index 0000000..c5a23b3 --- /dev/null +++ b/scenario5-debug-symbols/program.c @@ -0,0 +1,88 @@ +/* + * Scenario 5: Debug Symbols - The Missing Map + * ============================================ + * This program has multiple functions calling each other. + * We'll compile it with and without debug symbols to show the difference. + * + * Compile WITHOUT debug symbols: + * gcc -O2 -o nodebug program.c + * + * Compile WITH debug symbols: + * gcc -O2 -g -o withdebug program.c + * + * EXERCISES: + * 1. perf record ./nodebug && perf report (see hex addresses) + * 2. perf record ./withdebug && perf report (see function names) + * 3. perf annotate -s compute_inner (see source code!) 
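+ * 4. make stripped, then: perf record ./stripped 500 5000 && perf report
+ *    (the symbols disappear again - see Exercise 6 in the README)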
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* Deliberately NOT inlined to show in profiler */
+__attribute__((noinline))
+double compute_inner(double x, int iterations) {
+    double result = x;
+    for (int i = 0; i < iterations; i++) {
+        result = sin(result) * cos(result) + sqrt(fabs(result));
+        result = result * 1.0001 + 0.0001;
+    }
+    return result;
+}
+
+__attribute__((noinline))
+double compute_middle(double x, int iterations) {
+    double sum = 0.0;
+    for (int i = 0; i < 10; i++) {
+        sum += compute_inner(x + i, iterations);
+    }
+    return sum;
+}
+
+__attribute__((noinline))
+double compute_outer(int n, int iterations) {
+    double total = 0.0;
+    for (int i = 0; i < n; i++) {
+        total += compute_middle((double)i / n, iterations);
+    }
+    return total;
+}
+
+__attribute__((noinline))
+void process_data(int *data, int size) {
+    /* Some additional work to show in the profile */
+    long sum = 0;
+    for (int i = 0; i < size; i++) {
+        for (int j = 0; j < 100; j++) {
+            sum += data[i] * j;
+        }
+    }
+    printf("Data checksum: %ld\n", sum % 10000);
+}
+
+int main(int argc, char *argv[]) {
+    int n = 1000;
+    int iterations = 10000;
+
+    if (argc > 1) n = atoi(argv[1]);
+    if (argc > 2) iterations = atoi(argv[2]);
+
+    printf("Running with n=%d, iterations=%d\n", n, iterations);
+
+    /* Allocate some data */
+    int *data = malloc(n * sizeof(int));
+    for (int i = 0; i < n; i++) {
+        data[i] = i * 17 + 3;
+    }
+
+    /* Do some computation */
+    double result = compute_outer(n, iterations);
+    printf("Computation result: %f\n", result);
+
+    /* Process data */
+    process_data(data, n);
+
+    free(data);
+    return 0;
+}
diff --git a/scenario6-usdt-probes/Makefile b/scenario6-usdt-probes/Makefile
new file mode 100644
index 0000000..e264aa0
--- /dev/null
+++ b/scenario6-usdt-probes/Makefile
@@ -0,0 +1,35 @@
+CC = gcc
+CFLAGS = -O2 -g -Wall
+
+# Check if sys/sdt.h is available
+SDT_CHECK := $(shell echo '\#include <sys/sdt.h>' | $(CC) -E - >/dev/null 2>&1 && echo yes)
+
+ifeq ($(SDT_CHECK),yes)
+    CFLAGS += -DHAVE_SDT
+    SDT_STATUS = "USDT probes ENABLED"
+else
+    SDT_STATUS = "USDT probes DISABLED (install systemtap-sdt-dev)"
+endif
+
+all: server
+	@echo $(SDT_STATUS)
+
+server: server.c
+	$(CC) $(CFLAGS) -o $@ $<
+
+# List the probes embedded in the binary
+list-probes: server
+	@echo "Probes in binary (requires USDT support):"
+	readelf -n server 2>/dev/null | grep -A2 "stapsdt" || echo "No probes found"
+
+# Alternative: use perf to list probes
+perf-list: server
+	perf probe -x ./server --list 2>/dev/null || echo "No probes found or perf not supported"
+
+clean:
+	rm -f server
+
+install-deps:
+	sudo apt install systemtap-sdt-dev bpftrace
+
+.PHONY: all clean list-probes perf-list install-deps
diff --git a/scenario6-usdt-probes/README.md b/scenario6-usdt-probes/README.md
new file mode 100644
index 0000000..2da18a6
--- /dev/null
+++ b/scenario6-usdt-probes/README.md
@@ -0,0 +1,184 @@
+# Scenario 6: USDT Probes - Custom Instrumentation
+
+## Learning Objectives
+- Understand the difference between static and dynamic probes
+- Add USDT probes to C code
+- Trace probes with bpftrace
+- See how probes enable production debugging
+
+## Background
+
+**Dynamic probes** (like `perf probe`): added at runtime, can attach to almost any function
+**Static probes** (USDT): compiled into the binary, designed by the developer
+
+USDT probes are:
+- Nearly zero overhead when not traced
+- Stable API for debuggers/tracers
+- Self-documenting: probe names describe what's happening
+- Used by Python, Ruby, MySQL, PostgreSQL, and many other projects
+
+## Prerequisites
+
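+Before installing anything, you can check whether the SDT header is already present.
+This is the same test the `Makefile` runs internally (a quick sketch; if it prints
+`no`, install the packages below):
+
+```bash
+# Prints "yes" if the preprocessor can find <sys/sdt.h>, "no" otherwise
+echo '#include <sys/sdt.h>' | gcc -E - >/dev/null 2>&1 && echo yes || echo no
+```
+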
+```bash
+# Install USDT support and bpftrace
+sudo apt install systemtap-sdt-dev bpftrace
+```
+
+## Files
+- `server.c` - Simulated server with USDT probes at key points
+
+## Exercise 1: Build and Run
+
+```bash
+make
+./server 3 10
+```
+
+This processes 3 batches of 10 requests each.
+
+## Exercise 2: List the Probes
+
+```bash
+# Using readelf
+readelf -n ./server | grep -A2 stapsdt
+
+# Or using perf
+sudo perf probe -x ./server --list
+```
+
+You should see probes like:
+- `myserver:server_start`
+- `myserver:request_start`
+- `myserver:request_end`
+- `myserver:request_error`
+
+## Exercise 3: Trace with bpftrace
+
+### Count requests by type
+```bash
+# In terminal 1:
+sudo bpftrace -e '
+usdt:./server:myserver:request_start {
+    @types[arg1] = count();
+}
+END {
+    print(@types);
+}
+'
+
+# In terminal 2:
+./server 5 50
+```
+
+### Measure request latency
+```bash
+sudo bpftrace -e '
+usdt:./server:myserver:request_start {
+    @start[arg0] = nsecs;
+}
+usdt:./server:myserver:request_end {
+    $latency = (nsecs - @start[arg0]) / 1000000;
+    @latency_ms = hist($latency);
+    delete(@start[arg0]);
+}
+END {
+    print(@latency_ms);
+}
+'
+```
+
+### Track errors
+```bash
+sudo bpftrace -e '
+usdt:./server:myserver:request_error {
+    printf("ERROR: request %d failed with code %d\n", arg0, arg1);
+    @errors = count();
+}
+'
+```
+
+## Exercise 4: Trace with perf
+
+```bash
+# Register the SDT probe with perf (SDT events live in the sdt_<provider> group)
+sudo perf probe -x ./server sdt_myserver:request_start
+
+# Record
+sudo perf record -e sdt_myserver:request_start ./server 3 20
+
+# Report
+sudo perf report
+```
+
+## How USDT Probes Work
+
+The `DTRACE_PROBE` macro inserts a NOP instruction:
+
+```c
+DTRACE_PROBE2(myserver, request_start, req->id, req->type);
+```
+
+Compiles to something like:
+```asm
+nop    ; placeholder for probe
+```
+
+When you activate tracing:
+1. Tracer finds the probe location (stored in ELF notes)
+2. Replaces NOP with a trap instruction (INT3 on x86)
+3. Trap triggers, tracer runs, returns control
+4. When tracing stops, NOP is restored
+
+**Overhead when not tracing**: ~0 (just a NOP)
+**Overhead when tracing**: trap + handler execution
+
+## Real-World Uses
+
+### Python has USDT probes:
+```bash
+# If Python was built with --enable-dtrace
+sudo bpftrace -e 'usdt:/usr/bin/python3:python:function__entry { printf("%s\n", str(arg1)); }'
+```
+
+### MySQL probes for query tracking:
+```bash
+sudo bpftrace -e 'usdt:/usr/sbin/mysqld:mysql:query__start { printf("%s\n", str(arg0)); }'
+```
+
+## Discussion Questions
+
+1. **When would you use USDT vs dynamic probes?**
+   - USDT: Known important points, stable interface
+   - Dynamic: Ad-hoc debugging, no source changes
+
+2. **What's the trade-off of adding probes?**
+   - Pro: Always available for debugging
+   - Con: Must plan ahead, adds code complexity
+
+3. **Why not just use printf debugging?**
+   - Printf has overhead even when you don't need it
+   - USDT has zero overhead until activated
+   - USDT can be traced without rebuilding
+
+## Advanced: Creating Custom Probes
+
+The probe macros from `<sys/sdt.h>`:
+
+```c
+DTRACE_PROBE(provider, name)               // No arguments
+DTRACE_PROBE1(provider, name, arg1)        // 1 argument
+DTRACE_PROBE2(provider, name, arg1, arg2)  // 2 arguments
+// ... up to DTRACE_PROBE12
+```
+
+Arguments can be integers or pointers. Strings need special handling.
+A minimal sketch of a probe in your own program follows the Key Takeaways below.
+
+## Key Takeaways
+
+1. **USDT probes are designed-in observability**
+2. **Zero overhead when not actively tracing**
+3. **bpftrace makes probe usage easy**
+4. **Many production systems already have probes (Python, databases, etc.)**
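+
+As a pointer back to the "Advanced: Creating Custom Probes" section, here is a minimal
+sketch of a probe in your own program. The file name `demo.c` and the provider name
+`demo` are made up for illustration, and it assumes `systemtap-sdt-dev` is installed:
+
+```c
+/* demo.c - build with: gcc -O2 -g -o demo demo.c */
+#include <stdio.h>
+#include <sys/sdt.h>
+
+int main(void) {
+    for (int i = 0; i < 100; i++) {
+        /* provider "demo", probe "item_processed", one integer argument */
+        DTRACE_PROBE1(demo, item_processed, i);
+    }
+    printf("processed 100 items\n");
+    return 0;
+}
+```
+
+Trace it the same way as the workshop server, e.g.
+`sudo bpftrace -e 'usdt:./demo:demo:item_processed { @items = count(); }'`.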
+
+This is an advanced topic - the main takeaway for beginners is that
+such instrumentation exists and enables powerful production debugging.
diff --git a/scenario6-usdt-probes/server.c b/scenario6-usdt-probes/server.c
new file mode 100644
index 0000000..2d56980
--- /dev/null
+++ b/scenario6-usdt-probes/server.c
@@ -0,0 +1,141 @@
+/*
+ * Scenario 6: USDT Probes - Custom Instrumentation
+ * =================================================
+ * This program demonstrates User Statically Defined Tracepoints (USDT).
+ * USDT probes allow you to add custom tracing points to your code
+ * that have near-zero overhead when not actively traced.
+ *
+ * Compile (use `make`, which defines HAVE_SDT when sys/sdt.h is available):
+ *   gcc -DHAVE_SDT -O2 -g -o server server.c
+ *
+ * The probes use sys/sdt.h which is provided by systemtap-sdt-dev on Ubuntu.
+ * Install: sudo apt install systemtap-sdt-dev
+ *
+ * EXERCISES:
+ * 1. Run normally: ./server
+ * 2. List probes: perf probe -x ./server --list
+ * 3. Trace probes: See README.md for bpftrace examples
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+
+/*
+ * USDT probe macros from systemtap-sdt-dev
+ * If not available, we define dummy macros
+ */
+#ifdef HAVE_SDT
+#include <sys/sdt.h>
+#else
+/* Dummy macros when SDT isn't available */
+#define DTRACE_PROBE(provider, name)
+#define DTRACE_PROBE1(provider, name, arg1)
+#define DTRACE_PROBE2(provider, name, arg1, arg2)
+#define DTRACE_PROBE3(provider, name, arg1, arg2, arg3)
+#endif
+
+/* Simulated request structure */
+struct request {
+    int id;
+    int type;       /* 0=read, 1=write, 2=delete */
+    int size;       /* payload size */
+    char data[64];
+};
+
+/* Statistics */
+static unsigned long total_requests = 0;
+static unsigned long total_errors = 0;
+
+/*
+ * Process a single request
+ * We add USDT probes at entry, exit, and error points
+ */
+int process_request(struct request *req) {
+    /* Probe: request processing started */
+    DTRACE_PROBE2(myserver, request_start, req->id, req->type);
+
+    total_requests++;
+
+    /* Simulate variable processing time based on request type */
+    int delay_ms;
+    switch (req->type) {
+        case 0: delay_ms = 10 + (rand() % 20); break;  /* read: 10-30ms */
+        case 1: delay_ms = 50 + (rand() % 50); break;  /* write: 50-100ms */
+        case 2: delay_ms = 5 + (rand() % 10); break;   /* delete: 5-15ms */
+        default: delay_ms = 100;
+    }
+
+    /* Simulate some work */
+    usleep(delay_ms * 1000);
+
+    /* Simulate occasional errors (10% chance) */
+    if (rand() % 10 == 0) {
+        DTRACE_PROBE2(myserver, request_error, req->id, -1);
+        total_errors++;
+        return -1;
+    }
+
+    /* Probe: request completed successfully */
+    DTRACE_PROBE3(myserver, request_end, req->id, req->type, delay_ms);
+
+    return 0;
+}
+
+/*
+ * Batch processing function
+ */
+void process_batch(int batch_id, int count) {
+    DTRACE_PROBE2(myserver, batch_start, batch_id, count);
+
+    printf("Processing batch %d with %d requests...\n", batch_id, count);
+
+    struct request req;
+    for (int i = 0; i < count; i++) {
+        req.id = batch_id * 1000 + i;
+        req.type = rand() % 3;
+        req.size = 64;
+        snprintf(req.data, sizeof(req.data), "request_%d", req.id);
+
+        process_request(&req);
+    }
+
+    DTRACE_PROBE1(myserver, batch_end, batch_id);
+}
+
+int main(int argc, char *argv[]) {
+    int num_batches = 5;
+    int requests_per_batch = 20;
+
+    if (argc > 1) num_batches = atoi(argv[1]);
+    if (argc > 2) requests_per_batch = atoi(argv[2]);
+
+    srand(time(NULL));
+
+    printf("USDT Probe Demo Server\n");
+    printf("======================\n");
+    printf("Batches: %d, Requests per batch:
%d\n\n", num_batches, requests_per_batch); + +#ifndef HAVE_SDT + printf("Note: Compiled without USDT support.\n"); + printf("To enable probes: sudo apt install systemtap-sdt-dev\n"); + printf("Then compile with: gcc -DHAVE_SDT -O2 -g -o server server.c\n\n"); +#endif + + DTRACE_PROBE(myserver, server_start); + + for (int i = 0; i < num_batches; i++) { + process_batch(i, requests_per_batch); + } + + DTRACE_PROBE(myserver, server_stop); + + printf("\n=== Summary ===\n"); + printf("Total requests: %lu\n", total_requests); + printf("Total errors: %lu (%.1f%%)\n", + total_errors, 100.0 * total_errors / total_requests); + + return 0; +} diff --git a/scenario7-pyroscope/README.md b/scenario7-pyroscope/README.md new file mode 100644 index 0000000..5666938 --- /dev/null +++ b/scenario7-pyroscope/README.md @@ -0,0 +1,195 @@ +# Scenario 7: Continuous Profiling with Pyroscope + +## Learning Objectives +- Understand the difference between one-shot and continuous profiling +- Set up and use Pyroscope for Python applications +- Navigate the Pyroscope UI to find performance issues +- Compare flamegraphs over time + +## Background + +**One-shot profiling** (what we've done so far): +- Run profiler → Execute program → Stop → Analyze +- Good for: reproducible tests, specific scenarios +- Bad for: intermittent issues, production systems + +**Continuous profiling**: +- Always running in the background +- Low overhead (~2-5% CPU) +- Aggregates data over time +- Good for: production monitoring, finding intermittent issues + +## Files +- `app.py` - Flask web application with Pyroscope instrumentation +- `loadgen.sh` - Script to generate traffic +- `requirements.txt` - Python dependencies + +## Setup + +### 1. Start Pyroscope Server + +Option A: Docker (recommended) +```bash +docker run -d --name pyroscope -p 4040:4040 grafana/pyroscope +``` + +Option B: Binary download +```bash +# Download from https://github.com/grafana/pyroscope/releases +./pyroscope server +``` + +### 2. Install Python Dependencies +```bash +pip install -r requirements.txt +# Or: pip install flask pyroscope-io +``` + +### 3. Start the Application +```bash +python3 app.py +``` + +### 4. Generate Load +```bash +chmod +x loadgen.sh +./loadgen.sh http://localhost:5000 120 # 2 minutes of load +``` + +### 5. View Profiles +Open http://localhost:4040 in your browser. + +## Exercise 1: Explore the Pyroscope UI + +1. Go to http://localhost:4040 +2. Select `workshop.flask.app` from the application dropdown +3. Observe the flamegraph + +### UI Navigation +- **Timeline**: Shows CPU usage over time, click to select time range +- **Flamegraph**: Visual representation of where time is spent +- **Table view**: Sortable list of functions by self/total time +- **Diff view**: Compare two time ranges + +## Exercise 2: Find the Hot Function + +While `loadgen.sh` is running: + +1. Look at the flamegraph +2. Find `compute_primes_slow` - it should be prominent +3. Click on it to zoom in +4. See the call stack leading to it + +## Exercise 3: Compare Cached vs Uncached + +1. Note the current time +2. Stop `loadgen.sh` +3. Modify `loadgen.sh` to only hit cached endpoints (or run manually): + ```bash + for i in $(seq 100); do + curl -s "localhost:5000/api/hash_cached/test_$((i % 5))" + done + ``` +4. In Pyroscope, compare the two time periods using the diff view + +## Exercise 4: Spot I/O-Bound Code + +1. Generate load to the slow_io endpoint: + ```bash + for i in $(seq 50); do curl -s localhost:5000/api/slow_io; done + ``` +2. Look at the flamegraph +3. 
Notice that `time.sleep` doesn't show up much - why? + - CPU profiling only captures CPU time + - I/O wait (sleeping, network, disk) doesn't consume CPU + - This is why I/O-bound code looks "fast" in CPU profiles! + +## Exercise 5: Timeline Analysis + +1. Let `loadgen.sh` run for several minutes +2. In Pyroscope, zoom out the timeline +3. Look for patterns: + - Spikes in CPU usage + - Changes in the flamegraph shape over time +4. Select different time ranges to compare + +## Key Pyroscope Concepts + +### Flamegraph Reading +- **Width** = proportion of total samples (time) +- **Height** = call stack depth +- **Color** = usually arbitrary (for differentiation) +- **Plateaus** = functions that are "hot" + +### Comparing Profiles +Pyroscope can show: +- **Diff view**: Red = more time, Green = less time +- Useful for before/after comparisons + +### Tags +The app uses tags for filtering: +```python +pyroscope.configure( + tags={"env": "workshop", "version": "1.0.0"} +) +``` + +You can filter by tags in the UI. + +## Production Considerations + +### Overhead +- Pyroscope Python agent: ~2-5% CPU overhead +- Sampling rate can be tuned (default: 100Hz) + +### Data Volume +- Profiles are aggregated, not stored raw +- Storage is efficient (10-100MB per day per app) + +### Security +- Profile data can reveal code structure +- Consider who has access to Pyroscope + +### Alternatives +- **Datadog Continuous Profiler** +- **AWS CodeGuru Profiler** +- **Google Cloud Profiler** +- **Parca** (open source, eBPF-based) + +## Troubleshooting + +### "No data in Pyroscope" +- Check if Pyroscope server is running: http://localhost:4040 +- Check app logs for connection errors +- Verify `pyroscope-io` is installed + +### "Profile looks empty" +- Generate more load +- The endpoint might be I/O bound (not CPU) +- Check the time range in the UI + +### High overhead +- Reduce sampling rate in pyroscope.configure() +- Check for profiling-related exceptions + +## Discussion Questions + +1. **When would you use continuous profiling vs one-shot?** + - Continuous: production, long-running apps, intermittent issues + - One-shot: development, benchmarking, specific scenarios + +2. **What can't CPU profiling show you?** + - I/O wait time + - Lock contention (mostly) + - Memory allocation patterns + +3. **How would you profile a batch job vs a web server?** + - Batch: one-shot profiling of the entire run + - Server: continuous, focus on request handling paths + +## Key Takeaways + +1. **Continuous profiling catches issues that one-shot misses** +2. **Low overhead makes it safe for production** +3. **Timeline view reveals patterns over time** +4. **CPU profiling doesn't show I/O time** diff --git a/scenario7-pyroscope/app.py b/scenario7-pyroscope/app.py new file mode 100644 index 0000000..6b6995b --- /dev/null +++ b/scenario7-pyroscope/app.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Scenario 7: Continuous Profiling with Pyroscope +=============================================== +A simple Flask web app instrumented with Pyroscope for continuous profiling. + +SETUP: +1. Start Pyroscope: docker run -p 4040:4040 grafana/pyroscope +2. Install deps: pip install flask pyroscope-io +3. Run this app: python3 app.py +4. Generate load: ./loadgen.sh (or curl in a loop) +5. View profiles: http://localhost:4040 + +The app has intentionally slow endpoints to demonstrate profiling. 
+"""
+
+import os
+import time
+import math
+import hashlib
+from functools import lru_cache
+
+# Try to import pyroscope, gracefully handle if not installed
+try:
+    import pyroscope
+    PYROSCOPE_AVAILABLE = True
+except ImportError:
+    PYROSCOPE_AVAILABLE = False
+    print("Pyroscope not installed. Run: pip install pyroscope-io")
+    print("Continuing without profiling...\n")
+
+from flask import Flask, jsonify
+
+app = Flask(__name__)
+
+# Configure Pyroscope
+if PYROSCOPE_AVAILABLE:
+    pyroscope.configure(
+        application_name="workshop.flask.app",
+        server_address="http://localhost:4040",
+        # Enable profiling for specific aspects
+        tags={
+            "env": "workshop",
+            "version": "1.0.0",
+        }
+    )
+
+
+# ============================================================
+# Endpoint 1: CPU-intensive computation
+# ============================================================
+
+def compute_primes_slow(n):
+    """Intentionally slow prime computation."""
+    primes = []
+    for num in range(2, n):
+        is_prime = True
+        for i in range(2, int(math.sqrt(num)) + 1):
+            if num % i == 0:
+                is_prime = False
+                break
+        if is_prime:
+            primes.append(num)
+    return primes
+
+
+@app.route('/api/primes/<int:n>')
+def primes_endpoint(n):
+    """CPU-bound endpoint - compute primes up to n."""
+    n = min(n, 50000)  # Limit to prevent DoS
+    start = time.time()
+    primes = compute_primes_slow(n)
+    elapsed = time.time() - start
+    return jsonify({
+        'count': len(primes),
+        'limit': n,
+        'elapsed_ms': round(elapsed * 1000, 2)
+    })
+
+
+# ============================================================
+# Endpoint 2: Repeated expensive computation (needs caching)
+# ============================================================
+
+def expensive_hash(data, iterations=1000):
+    """Simulate expensive computation."""
+    result = data.encode()
+    for _ in range(iterations):
+        result = hashlib.sha256(result).digest()
+    return result.hex()
+
+
+@app.route('/api/hash/<data>')
+def hash_endpoint(data):
+    """
+    This endpoint recomputes the hash every time.
+    Profile will show expensive_hash taking lots of time.
+    See hash_cached endpoint for improvement.
+    """
+    start = time.time()
+    result = expensive_hash(data)
+    elapsed = time.time() - start
+    return jsonify({
+        'input': data,
+        'hash': result[:16] + '...',
+        'elapsed_ms': round(elapsed * 1000, 2)
+    })
+
+
+@lru_cache(maxsize=1000)
+def expensive_hash_cached(data, iterations=1000):
+    """Cached version of expensive_hash."""
+    result = data.encode()
+    for _ in range(iterations):
+        result = hashlib.sha256(result).digest()
+    return result.hex()
+
+
+@app.route('/api/hash_cached/<data>')
+def hash_cached_endpoint(data):
+    """Cached version - compare profile with /api/hash."""
+    start = time.time()
+    result = expensive_hash_cached(data)
+    elapsed = time.time() - start
+    return jsonify({
+        'input': data,
+        'hash': result[:16] + '...',
+        'elapsed_ms': round(elapsed * 1000, 2),
+        'cache_info': str(expensive_hash_cached.cache_info())
+    })
+
+
+# ============================================================
+# Endpoint 3: I/O simulation
+# ============================================================
+
+@app.route('/api/slow_io')
+def slow_io_endpoint():
+    """
+    Simulate slow I/O (database query, external API, etc.)
+    This won't show much in CPU profiles - it's I/O bound!
+    """
+    time.sleep(0.1)  # Simulate 100ms I/O
+    return jsonify({'status': 'ok', 'simulated_io_ms': 100})
+
+
+# ============================================================
+# Endpoint 4: Mix of work types
+# ============================================================
+
+@app.route('/api/mixed/<int:n>')
+def mixed_endpoint(n):
+    """Mixed workload: some CPU, some I/O."""
+    n = min(n, 1000)
+
+    # CPU work
+    total = 0
+    for i in range(n * 100):
+        total += math.sin(i) * math.cos(i)
+
+    # Simulated I/O
+    time.sleep(0.01)
+
+    # More CPU work
+    data = str(total).encode()
+    for _ in range(100):
+        data = hashlib.md5(data).digest()
+
+    return jsonify({
+        'n': n,
+        'result': data.hex()[:16]
+    })
+
+
+# ============================================================
+# Health check
+# ============================================================
+
+@app.route('/health')
+def health():
+    return jsonify({'status': 'healthy', 'pyroscope': PYROSCOPE_AVAILABLE})
+
+
+@app.route('/')
+def index():
+    return '''
+    <h1>Pyroscope Demo App</h1>
+    <h2>Endpoints:</h2>
+    <ul>
+        <li>/api/primes/&lt;n&gt;</li>
+        <li>/api/hash/&lt;data&gt;</li>
+        <li>/api/hash_cached/&lt;data&gt;</li>
+        <li>/api/slow_io</li>
+        <li>/api/mixed/&lt;n&gt;</li>
+        <li>/health</li>
+    </ul>
+    <h2>Profiling:</h2>
+    <p>View profiles at <a href="http://localhost:4040">http://localhost:4040</a></p>
+ ''' + + +if __name__ == '__main__': + print("Starting Flask app on http://localhost:5000") + print("Pyroscope dashboard: http://localhost:4040") + print("\nGenerate load with: ./loadgen.sh") + print("Or: for i in $(seq 100); do curl -s localhost:5000/api/primes/5000 > /dev/null; done") + app.run(host='0.0.0.0', port=5000, debug=False) diff --git a/scenario7-pyroscope/loadgen.sh b/scenario7-pyroscope/loadgen.sh new file mode 100644 index 0000000..c992bbf --- /dev/null +++ b/scenario7-pyroscope/loadgen.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Load generator for Pyroscope demo +# Run this to generate traffic that will show up in Pyroscope +# + +BASE_URL="${1:-http://localhost:5000}" +DURATION="${2:-60}" # seconds + +echo "Generating load to $BASE_URL for $DURATION seconds" +echo "Press Ctrl+C to stop" +echo "" + +end_time=$(($(date +%s) + DURATION)) +request_count=0 + +while [ $(date +%s) -lt $end_time ]; do + # Mix of different endpoints + case $((RANDOM % 10)) in + 0|1|2|3) + # 40% - CPU intensive (primes) + n=$((1000 + RANDOM % 4000)) + curl -s "$BASE_URL/api/primes/$n" > /dev/null + ;; + 4|5) + # 20% - Hash (uncached) + data="data_$(($RANDOM % 100))" + curl -s "$BASE_URL/api/hash/$data" > /dev/null + ;; + 6|7) + # 20% - Hash (cached) + data="data_$(($RANDOM % 10))" # Smaller set for better cache hits + curl -s "$BASE_URL/api/hash_cached/$data" > /dev/null + ;; + 8) + # 10% - Slow I/O + curl -s "$BASE_URL/api/slow_io" > /dev/null + ;; + 9) + # 10% - Mixed + curl -s "$BASE_URL/api/mixed/500" > /dev/null + ;; + esac + + request_count=$((request_count + 1)) + + # Print progress every 10 requests + if [ $((request_count % 10)) -eq 0 ]; then + echo -ne "\rRequests: $request_count" + fi + + # Small delay to avoid overwhelming + sleep 0.1 +done + +echo "" +echo "Done! Total requests: $request_count" +echo "Check Pyroscope at http://localhost:4040" diff --git a/scenario7-pyroscope/requirements.txt b/scenario7-pyroscope/requirements.txt new file mode 100644 index 0000000..1d58601 --- /dev/null +++ b/scenario7-pyroscope/requirements.txt @@ -0,0 +1,2 @@ +flask>=2.0.0 +pyroscope-io>=0.8.0