commit 4fb1bd90db7b53ea71fa8076ddaac9b8911b4b43 Author: illustris Date: Thu Jan 8 18:11:30 2026 +0530 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b25c15b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d720c1 --- /dev/null +++ b/README.md @@ -0,0 +1,275 @@ +# Linux Performance Engineering Workshop + +## 4-Hour Hands-On Training for BITS Pilani Goa + +### Prerequisites +- Basic C programming knowledge +- Basic Python knowledge +- Familiarity with command line +- Ubuntu 22.04/24.04 (or similar Linux) + +--- + +## Workshop Overview + +This workshop teaches practical performance engineering skills using libre tools on Linux. +By the end, you'll be able to identify and fix common performance problems. + +### What You'll Learn +- How to measure program performance (not guess!) +- CPU profiling with `perf` and flamegraphs +- Identifying syscall overhead with `strace` +- Understanding cache behavior +- Continuous profiling for production systems + +### Philosophy +> "Measure, don't guess." + +Most performance intuitions are wrong. This workshop teaches you to find bottlenecks with data. + +--- + +## Schedule + +| Time | Topic | Hands-On | +|------|-------|----------| +| 0:00-0:45 | Introduction & Theory | - | +| 0:45-1:30 | Python Profiling | Scenarios 1 & 2 | +| 1:30-1:45 | Break | - | +| 1:45-2:30 | perf & Flamegraphs | Theory + Demo | +| 2:30-3:30 | Cache & Debug Symbols | Scenarios 4 & 5 | +| 3:30-4:00 | Lunch Break | - | +| 4:00-4:30 | Syscalls & I/O | Theory | +| 4:30-5:15 | Syscall Profiling | Scenario 3 | +| 5:15-5:30 | Break | - | +| 5:30-6:00 | Advanced Topics & Wrap-up | Scenarios 6 & 7 | + +--- + +## Setup Instructions + +### Install Required Packages + +```bash +# Core tools +sudo apt update +sudo apt install -y \ + build-essential \ + linux-tools-common \ + linux-tools-$(uname -r) \ + strace \ + ltrace \ + htop \ + python3-pip + +# Optional but recommended +sudo apt install -y \ + hyperfine \ + valgrind \ + systemtap-sdt-dev + +# Python tools +pip3 install py-spy + +# Pyroscope (for scenario 7) +# Option A: Docker +docker pull grafana/pyroscope +# Option B: Download binary from https://github.com/grafana/pyroscope/releases + +# FlameGraph scripts +cd ~ +git clone https://github.com/brendangregg/FlameGraph.git +``` + +### Configure perf Permissions + +```bash +# Allow perf for non-root users (needed for this workshop) +sudo sysctl -w kernel.perf_event_paranoid=1 + +# To make permanent: +echo 'kernel.perf_event_paranoid=1' | sudo tee -a /etc/sysctl.conf +``` + +### Verify Installation + +```bash +# Should all work without errors: +perf --version +strace --version +py-spy --version +gcc --version +python3 --version +``` + +--- + +## Directory Structure + +``` +perf-workshop/ +├── README.md # This file +├── common/ +│ └── CHEATSHEET.md # Quick reference card +├── scenario1-python-to-c/ +│ ├── README.md +│ ├── prime_slow.py # Slow Python version +│ ├── prime.c # C implementation +│ └── prime_fast.py # Python + C via ctypes +├── scenario2-memoization/ +│ ├── README.md +│ ├── fib_slow.py # Naive recursive Fibonacci +│ ├── fib_cached.py # Memoized Fibonacci +│ └── config_validator.py # Precomputation example +├── scenario3-syscall-storm/ +│ ├── README.md +│ ├── Makefile +│ ├── read_slow.c # Byte-by-byte reads +│ ├── read_fast.c # Buffered reads +│ ├── read_stdio.c # stdio buffering +│ └── read_python.py # Python equivalent +├── scenario4-cache-misses/ +│ ├── README.md +│ ├── Makefile +│ 
├── cache_demo.c # Row vs column major +│ └── list_vs_array.c # Array vs linked list +├── scenario5-debug-symbols/ +│ ├── README.md +│ ├── Makefile +│ └── program.c # Multi-function program +├── scenario6-usdt-probes/ +│ ├── README.md +│ ├── Makefile +│ └── server.c # Program with USDT probes +└── scenario7-pyroscope/ + ├── README.md + ├── requirements.txt + ├── app.py # Flask app with Pyroscope + └── loadgen.sh # Load generator script +``` + +--- + +## Quick Start + +### Build Everything + +```bash +# Build all C programs +for dir in scenario{3,4,5,6}*/; do + if [ -f "$dir/Makefile" ]; then + echo "Building $dir" + make -C "$dir" + fi +done + +# Build scenario 1 C library +cd scenario1-python-to-c +gcc -O2 -fPIC -shared -o libprime.so prime.c +cd .. +``` + +### Run a Scenario + +Each scenario has its own README with step-by-step instructions. +Start with: + +```bash +cd scenario1-python-to-c +cat README.md +``` + +--- + +## Key Concepts Summary + +### 1. Types of Bottlenecks + +| Type | Symptom | Tool | +|------|---------|------| +| CPU-bound | `user` time is high | `perf record` | +| Syscall-bound | `sys` time is high | `strace -c` | +| I/O-bound | Low CPU, slow wall time | `strace`, `iostat` | +| Memory-bound | High cache misses | `perf stat` | + +### 2. Profiling Workflow + +``` +1. Measure: time ./program +2. Hypothesize: Where is time spent? +3. Profile: perf/strace/cProfile +4. Analyze: Find hot spots +5. Optimize: Fix the bottleneck +6. Verify: Re-measure +``` + +### 3. Tool Selection + +| Task | Tool | +|------|------| +| Basic timing | `time` | +| CPU sampling | `perf record` | +| Hardware counters | `perf stat` | +| Syscall tracing | `strace -c` | +| Python profiling | `cProfile`, `py-spy` | +| Visualization | Flamegraphs | +| Continuous profiling | Pyroscope | + +--- + +## Further Learning + +### Books +- "Systems Performance" by Brendan Gregg +- "BPF Performance Tools" by Brendan Gregg + +### Online Resources +- https://www.brendangregg.com/linuxperf.html +- https://perf.wiki.kernel.org/ +- https://jvns.ca/blog/2016/03/12/how-does-perf-work-and-some-questions/ + +### Tools to Explore Later +- `bpftrace` - High-level tracing language +- `eBPF` - In-kernel programmability +- `Valgrind` - Memory profiling +- `gprof` - Traditional profiler + +--- + +## Troubleshooting + +### "perf: command not found" +```bash +sudo apt install linux-tools-common linux-tools-$(uname -r) +``` + +### "Access to performance monitoring operations is limited" +```bash +sudo sysctl -w kernel.perf_event_paranoid=1 +``` + +### "py-spy: Permission denied" +Either run as root or use `--nonblocking`: +```bash +sudo py-spy record -o profile.svg -- python3 script.py +# Or: +py-spy record --nonblocking -o profile.svg -- python3 script.py +``` + +### "No debug symbols" +Recompile with `-g`: +```bash +gcc -O2 -g -o program program.c +``` + +--- + +## Feedback + +Found an issue? Have suggestions? +Please provide feedback to your instructor! 
+ +--- + +*Workshop materials prepared for BITS Pilani Goa* +*Tools: All libre/open-source software* diff --git a/common/CHEATSHEET.md b/common/CHEATSHEET.md new file mode 100644 index 0000000..0b637c3 --- /dev/null +++ b/common/CHEATSHEET.md @@ -0,0 +1,256 @@ +# Linux Performance Tools Cheatsheet + +## Quick Reference Card for Workshop + +--- + +## time - Basic Timing + +```bash +time ./program # Wall clock, user, sys time +time -v ./program # Verbose (GNU time, may need /usr/bin/time -v) +``` + +**Output explained:** +- `real` - Wall clock time (what a stopwatch would show) +- `user` - CPU time in user space (your code) +- `sys` - CPU time in kernel space (syscalls) + +--- + +## perf stat - Hardware Counters + +```bash +# Basic stats +perf stat ./program + +# Specific events +perf stat -e cycles,instructions,cache-misses ./program + +# Repeat for statistical accuracy +perf stat -r 5 ./program + +# Common events +perf stat -e cycles,instructions,cache-references,cache-misses,branches,branch-misses ./program +``` + +**Key metrics:** +- **IPC** (Instructions Per Cycle): Higher is better, >1 is good +- **Cache miss ratio**: Lower is better +- **Branch misses**: Lower is better + +--- + +## perf record/report - CPU Sampling + +```bash +# Record samples +perf record ./program +perf record -g ./program # With call graphs +perf record -F 99 ./program # Custom frequency (99 Hz) + +# Analyze +perf report # Interactive TUI +perf report --stdio # Text output +perf report -n --stdio # With sample counts +``` + +**TUI navigation:** +- Arrow keys: Navigate +- Enter: Zoom into function +- `a`: Annotate (show source/assembly) +- `q`: Quit/back + +--- + +## perf annotate - Source-Level View + +```bash +# After perf record +perf annotate function_name # Show hot lines +perf annotate -s function_name # Source view (needs -g) +``` + +**Requires:** Compiled with `-g` for source view + +--- + +## strace - Syscall Tracing + +```bash +# Summary of syscalls +strace -c ./program + +# Trace specific syscalls +strace -e read,write ./program + +# With timing per call +strace -T ./program + +# Follow forks +strace -f ./program + +# Output to file +strace -o trace.log ./program +``` + +**Key columns in `-c` output:** +- `% time`: Percentage of total syscall time +- `calls`: Number of times called +- `errors`: Failed calls + +--- + +## Flamegraphs + +```bash +# Clone FlameGraph repo (one time) +git clone https://github.com/brendangregg/FlameGraph.git + +# Generate flamegraph +perf record -g ./program +perf script | ./FlameGraph/stackcollapse-perf.pl | ./FlameGraph/flamegraph.pl > profile.svg + +# Open in browser +firefox profile.svg +``` + +--- + +## Python Profiling + +```bash +# cProfile - built-in profiler +python3 -m cProfile script.py +python3 -m cProfile -s cumtime script.py # Sort by cumulative time +python3 -m cProfile -s ncalls script.py # Sort by call count +python3 -m cProfile -o profile.stats script.py # Save for later analysis + +# py-spy - sampling profiler (low overhead) +pip install py-spy +py-spy record -o profile.svg -- python3 script.py # Flamegraph +py-spy top -- python3 script.py # Live view + +# Attach to running process +py-spy top --pid 12345 +``` + +--- + +## /proc Filesystem + +```bash +# Process info +cat /proc//status # Process status +cat /proc//maps # Memory mappings +cat /proc//fd # Open file descriptors +cat /proc//smaps # Detailed memory info + +# System info +cat /proc/cpuinfo # CPU details +cat /proc/meminfo # Memory details +cat /proc/loadavg # Load average +``` + +--- + +## htop - 
System Overview + +```bash +htop # Interactive process viewer +``` + +**Key shortcuts:** +- `F6`: Sort by column +- `F5`: Tree view +- `F9`: Kill process +- `t`: Toggle tree view +- `H`: Toggle user threads + +--- + +## hyperfine - Benchmarking + +```bash +# Basic benchmark +hyperfine './program' + +# Compare two programs +hyperfine './program_v1' './program_v2' + +# With warmup +hyperfine --warmup 3 './program' + +# Export results +hyperfine --export-markdown results.md './program' +``` + +--- + +## Quick Diagnosis Flowchart + +``` +Program is slow + │ + ▼ +┌──────────────────┐ +│ time ./program │ +└────────┬─────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ Is 'sys' time high? │ + └─────────┬───────────┬───────────┘ + YES NO + │ │ + ▼ ▼ + ┌──────────┐ ┌──────────────┐ + │ strace -c │ │ perf record │ + │ (syscalls)│ │ (CPU profile)│ + └──────────┘ └──────────────┘ +``` + +--- + +## Permission Issues + +```bash +# Allow perf for non-root (temporary) +sudo sysctl -w kernel.perf_event_paranoid=1 + +# Allow perf for non-root (permanent) +echo 'kernel.perf_event_paranoid=1' | sudo tee -a /etc/sysctl.conf + +# Run perf as root if needed +sudo perf record ./program +``` + +--- + +## Useful One-Liners + +```bash +# Top 10 functions by CPU time +perf report -n --stdio | head -20 + +# Count syscalls by type +strace -c ./program 2>&1 | tail -20 + +# Watch cache misses in real-time +perf stat -e cache-misses -I 1000 -p + +# Find memory-mapped files for a process +cat /proc//maps | grep -v '\[' | awk '{print $6}' | sort -u + +# Monitor a Python process +py-spy top --pid $(pgrep -f 'python.*myapp') +``` + +--- + +## Resources + +- **Brendan Gregg's site**: https://www.brendangregg.com/linuxperf.html +- **perf wiki**: https://perf.wiki.kernel.org/ +- **FlameGraph repo**: https://github.com/brendangregg/FlameGraph +- **py-spy docs**: https://github.com/benfred/py-spy diff --git a/scenario1-python-to-c/README.md b/scenario1-python-to-c/README.md new file mode 100644 index 0000000..4f8dc14 --- /dev/null +++ b/scenario1-python-to-c/README.md @@ -0,0 +1,55 @@ +# Scenario 1: Python to C Optimization + +## Learning Objectives +- Use `time` to measure execution time +- Profile Python code with `cProfile` +- Generate flamegraphs with `py-spy` +- Use `ctypes` to call C code from Python + +## Files +- `prime_slow.py` - Pure Python implementation (slow) +- `prime.c` - C implementation of the hot function +- `prime_fast.py` - Python calling C via ctypes + +## Exercises + +### Step 1: Measure the baseline +```bash +time python3 prime_slow.py 100000 +``` + +Note the `real` time (wall clock) and `user` time (CPU time in user space). + +### Step 2: Profile with cProfile +```bash +python3 -m cProfile -s cumtime prime_slow.py 100000 +``` + +Look for: +- Which function has the highest `cumtime` (cumulative time)? +- How many times (`ncalls`) is `is_prime` called? + +### Step 3: Generate a flamegraph +```bash +py-spy record -o prime_slow.svg -- python3 prime_slow.py +``` + +Open `prime_slow.svg` in a browser. The widest bar at the top shows where time is spent. + +### Step 4: Compile and run the optimized version +```bash +# Compile the C library +gcc -O2 -fPIC -shared -o libprime.so prime.c + +# Run the fast version +time python3 prime_fast.py 100000 +``` + +### Step 5: Compare +- How much faster is the C version? +- Generate a flamegraph for `prime_fast.py` - what's different? + +## Discussion Questions +1. Why is the C version faster? (Hint: interpreter overhead, type checking) +2. 
When is it worth rewriting in C vs. finding a library? +3. What's the trade-off of using `ctypes` vs native Python? diff --git a/scenario1-python-to-c/prime.c b/scenario1-python-to-c/prime.c new file mode 100644 index 0000000..44f0233 --- /dev/null +++ b/scenario1-python-to-c/prime.c @@ -0,0 +1,36 @@ +/* + * Scenario 1: C implementation of is_prime + * ========================================= + * Compile as shared library: + * gcc -O2 -fPIC -shared -o libprime.so prime.c + * + * Or with debug symbols for profiling: + * gcc -O2 -g -fPIC -shared -o libprime.so prime.c + */ + +#include + +/* Check if n is prime using trial division */ +int is_prime(int64_t n) { + if (n < 2) return 0; + if (n == 2) return 1; + if (n % 2 == 0) return 0; + + int64_t i = 3; + while (i * i <= n) { + if (n % i == 0) return 0; + i += 2; + } + return 1; +} + +/* Count primes up to limit - can also be called from Python */ +int64_t count_primes(int64_t limit) { + int64_t count = 0; + for (int64_t n = 2; n <= limit; n++) { + if (is_prime(n)) { + count++; + } + } + return count; +} diff --git a/scenario1-python-to-c/prime_fast.py b/scenario1-python-to-c/prime_fast.py new file mode 100644 index 0000000..cccc9c9 --- /dev/null +++ b/scenario1-python-to-c/prime_fast.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +""" +Scenario 1: The Optimized Version +================================= +This version calls the C implementation via ctypes. + +SETUP: + gcc -O2 -fPIC -shared -o libprime.so prime.c + +EXERCISES: +1. Compare: time python3 prime_fast.py +2. Profile: py-spy record -o prime_fast.svg -- python3 prime_fast.py +3. Compare the flamegraphs - what changed? +""" + +import ctypes +import sys +import os + +# Load the shared library +script_dir = os.path.dirname(os.path.abspath(__file__)) +lib_path = os.path.join(script_dir, "libprime.so") + +try: + libprime = ctypes.CDLL(lib_path) +except OSError as e: + print(f"Error: Could not load {lib_path}") + print("Please compile first: gcc -O2 -fPIC -shared -o libprime.so prime.c") + sys.exit(1) + +# Define function signatures +libprime.is_prime.argtypes = [ctypes.c_int64] +libprime.is_prime.restype = ctypes.c_int + +libprime.count_primes.argtypes = [ctypes.c_int64] +libprime.count_primes.restype = ctypes.c_int64 + + +def is_prime(n): + """Python wrapper for C is_prime.""" + return bool(libprime.is_prime(n)) + + +def count_primes(limit): + """Python wrapper for C count_primes.""" + return libprime.count_primes(limit) + + +def main(): + limit = 1_000_000 + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + print(f"Counting primes up to {limit} (using C library)...") + result = count_primes(limit) + print(f"Found {result} primes") + + +if __name__ == "__main__": + main() diff --git a/scenario1-python-to-c/prime_slow.py b/scenario1-python-to-c/prime_slow.py new file mode 100644 index 0000000..14866ef --- /dev/null +++ b/scenario1-python-to-c/prime_slow.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Scenario 1: The Obvious CPU Hog +=============================== +This program counts prime numbers using trial division. +It's intentionally slow to demonstrate profiling. + +EXERCISES: +1. Run with: time python3 prime_slow.py +2. Profile with: python3 -m cProfile -s cumtime prime_slow.py +3. Generate flamegraph: py-spy record -o prime_slow.svg -- python3 prime_slow.py +4. 
Identify the hot function, then see prime_fast.py for the optimized version +""" + +import sys + + +def is_prime(n): + """Check if n is prime using trial division.""" + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + # Check odd divisors up to sqrt(n) + i = 3 + while i * i <= n: + if n % i == 0: + return False + i += 2 + return True + + +def count_primes(limit): + """Count all primes up to limit.""" + count = 0 + for n in range(2, limit + 1): + if is_prime(n): + count += 1 + return count + + +def main(): + limit = 1_000_000 # Adjust this to change runtime + if len(sys.argv) > 1: + limit = int(sys.argv[1]) + + print(f"Counting primes up to {limit}...") + result = count_primes(limit) + print(f"Found {result} primes") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/README.md b/scenario2-memoization/README.md new file mode 100644 index 0000000..2b2cfb5 --- /dev/null +++ b/scenario2-memoization/README.md @@ -0,0 +1,96 @@ +# Scenario 2: Memoization and Precomputation + +## Learning Objectives +- Read cProfile output to identify redundant function calls +- Use `@functools.lru_cache` for automatic memoization +- Recognize when precomputation beats memoization +- Understand space-time trade-offs + +## Files +- `fib_slow.py` - Naive recursive Fibonacci (exponential time) +- `fib_cached.py` - Memoized Fibonacci (linear time) +- `config_validator.py` - Comparison of naive, memoized, and precomputed approaches + +## Exercise 1: Fibonacci + +### Step 1: Experience the slowness +```bash +time python3 fib_slow.py 35 +``` + +This should take several seconds. Don't try n=50! + +### Step 2: Profile to understand why +```bash +python3 -m cProfile -s ncalls fib_slow.py 35 2>&1 | head -20 +``` + +Key insight: Look at `ncalls` for the `fib` function. For fib(35), it's called +millions of times because we recompute the same values repeatedly. + +The call tree looks like: +``` +fib(5) +├── fib(4) +│ ├── fib(3) +│ │ ├── fib(2) +│ │ └── fib(1) +│ └── fib(2) +└── fib(3) <-- Same as above! Redundant! + ├── fib(2) + └── fib(1) +``` + +### Step 3: Apply memoization +```bash +time python3 fib_cached.py 35 +``` + +Now try a much larger value: +```bash +time python3 fib_cached.py 100 +``` + +### Step 4: Verify the improvement +```bash +python3 -m cProfile -s ncalls fib_cached.py 35 2>&1 | head -20 +``` + +The `ncalls` should now be O(n) instead of O(2^n). + +## Exercise 2: Config Validator + +This example shows when precomputation is better than memoization. + +### Run all three strategies +```bash +python3 config_validator.py 5000 +``` + +### Profile to understand the differences +```bash +python3 -m cProfile -s cumtime config_validator.py 5000 +``` + +### Discussion Questions +1. Why is precomputation faster than memoization here? + - Hint: How many unique inputs are there? + - Hint: What's the overhead of cache lookup vs dict lookup? + +2. When would memoization be better than precomputation? + - Hint: What if there were 10,000 rules and 10,000 event types? + - Hint: What if we didn't know the inputs ahead of time? + +3. What's the memory trade-off? 
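+
+As a starting point for question 3, a short sketch like the one below can put rough numbers on the trade-off. It is an illustrative helper rather than part of the scenario files, and it assumes it is run from this directory so that `config_validator` is importable; note that `sys.getsizeof` measures only the dict container, not the cached keys and values.
+
+```python
+#!/usr/bin/env python3
+"""Rough memory-cost comparison: lru_cache vs precomputed table."""
+import sys
+
+import config_validator as cv
+
+events = cv.generate_test_events(5000)
+
+# Memoized path: lru_cache reports how many results it is holding.
+cv.validate_rule_cached.cache_clear()
+cv.process_events_memoized(events)
+print("lru_cache entries:", cv.validate_rule_cached.cache_info().currsize)
+
+# Precomputed path: the module-level table is rebuilt on first use.
+cv.VALIDATION_TABLE = None
+cv.process_events_precomputed(events)
+print("table entries:", len(cv.VALIDATION_TABLE))
+print("table container size:", sys.getsizeof(cv.VALIDATION_TABLE), "bytes")
+```
+
+Both caches end up holding the same 25 results in this scenario, so the comparison in question 1 comes down to lookup overhead rather than memory.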
+ +## Key Takeaways + +| Approach | When to Use | +|----------|-------------| +| No caching | Function is cheap OR called once per input | +| Memoization | Unknown/large input space, function is expensive | +| Precomputation | Known/small input space, amortize cost over many lookups | + +## Further Reading +- `functools.lru_cache` documentation +- `functools.cache` (Python 3.9+) - unbounded cache, simpler API diff --git a/scenario2-memoization/config_validator.py b/scenario2-memoization/config_validator.py new file mode 100644 index 0000000..784db3c --- /dev/null +++ b/scenario2-memoization/config_validator.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +""" +Scenario 2b: The Precomputation Insight +======================================= +This simulates a config validator that checks rules against events. +The "expensive" validation function is called repeatedly with the same inputs. + +This example shows three stages of optimization: +1. Naive: call the function every time +2. Memoized: cache results with @lru_cache +3. Precomputed: since inputs are known ahead of time, build a lookup table + +EXERCISES: +1. Run each version and compare times +2. Profile each version - observe ncalls and cumtime +3. Think about: when is precomputation better than memoization? +""" + +import sys +import time +from functools import lru_cache + + +# Simulated "expensive" validation function +def validate_rule_slow(rule_id, event_type): + """ + Simulate an expensive validation check. + In real life, this might query a database, parse XML, etc. + """ + # Artificial delay to simulate expensive computation + total = 0 + for i in range(10000): + total += (rule_id * event_type * i) % 997 + return total % 2 == 0 # Returns True or False + + +# The set of all valid (rule_id, event_type) pairs we'll encounter +RULES = [1, 2, 3, 4, 5] +EVENT_TYPES = [10, 20, 30, 40, 50] + + +def process_events_naive(events): + """Process events using naive repeated validation.""" + valid_count = 0 + for rule_id, event_type, data in events: + if validate_rule_slow(rule_id, event_type): + valid_count += 1 + return valid_count + + +# Memoized version +@lru_cache(maxsize=None) +def validate_rule_cached(rule_id, event_type): + """Same validation but with caching.""" + total = 0 + for i in range(10000): + total += (rule_id * event_type * i) % 997 + return total % 2 == 0 + + +def process_events_memoized(events): + """Process events using memoized validation.""" + valid_count = 0 + for rule_id, event_type, data in events: + if validate_rule_cached(rule_id, event_type): + valid_count += 1 + return valid_count + + +# Precomputed version +def build_validation_table(): + """ + Build a lookup table for all possible (rule_id, event_type) combinations. + This is O(n*m) upfront but O(1) per lookup thereafter. 
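+    With only len(RULES) * len(EVENT_TYPES) = 25 combinations here, the
+    table is tiny and the upfront cost is paid exactly once.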
+ """ + table = {} + for rule_id in RULES: + for event_type in EVENT_TYPES: + table[(rule_id, event_type)] = validate_rule_slow(rule_id, event_type) + return table + + +VALIDATION_TABLE = None # Lazy initialization + + +def process_events_precomputed(events): + """Process events using precomputed lookup table.""" + global VALIDATION_TABLE + if VALIDATION_TABLE is None: + VALIDATION_TABLE = build_validation_table() + + valid_count = 0 + for rule_id, event_type, data in events: + if VALIDATION_TABLE[(rule_id, event_type)]: + valid_count += 1 + return valid_count + + +def generate_test_events(n): + """Generate n random test events.""" + import random + random.seed(42) # Reproducible + events = [] + for i in range(n): + rule_id = random.choice(RULES) + event_type = random.choice(EVENT_TYPES) + data = f"event_{i}" + events.append((rule_id, event_type, data)) + return events + + +def benchmark(name, func, events): + """Run a function and report timing.""" + start = time.perf_counter() + result = func(events) + elapsed = time.perf_counter() - start + print(f"{name:20s}: {elapsed:.3f}s (valid: {result})") + return elapsed + + +def main(): + n_events = 5000 + if len(sys.argv) > 1: + n_events = int(sys.argv[1]) + + print(f"Processing {n_events} events...") + print(f"Unique (rule, event_type) combinations: {len(RULES) * len(EVENT_TYPES)}") + print() + + events = generate_test_events(n_events) + + # Reset cached function for fair comparison + validate_rule_cached.cache_clear() + global VALIDATION_TABLE + VALIDATION_TABLE = None + + t_naive = benchmark("Naive", process_events_naive, events) + + validate_rule_cached.cache_clear() + t_memo = benchmark("Memoized", process_events_memoized, events) + + VALIDATION_TABLE = None + t_pre = benchmark("Precomputed", process_events_precomputed, events) + + print() + print(f"Speedup (memo vs naive): {t_naive/t_memo:.1f}x") + print(f"Speedup (precomp vs naive): {t_naive/t_pre:.1f}x") + print(f"Speedup (precomp vs memo): {t_memo/t_pre:.1f}x") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/fib_cached.py b/scenario2-memoization/fib_cached.py new file mode 100644 index 0000000..e8c2c65 --- /dev/null +++ b/scenario2-memoization/fib_cached.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +""" +Scenario 2: Memoization with functools.lru_cache +================================================ +Adding @lru_cache transforms O(2^n) into O(n) by caching results. + +EXERCISES: +1. Run: time python3 fib_cached.py 35 +2. Compare to fib_slow.py - how much faster? +3. Profile: python3 -m cProfile -s ncalls fib_cached.py 35 +4. Notice the dramatic reduction in call count +5. 
Try a much larger number: python3 fib_cached.py 100 +""" + +import sys +from functools import lru_cache + + +@lru_cache(maxsize=None) # Unlimited cache size +def fib(n): + """Compute the nth Fibonacci number with memoization.""" + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) + + +def main(): + n = 35 + if len(sys.argv) > 1: + n = int(sys.argv[1]) + + print(f"Computing fib({n}) with memoization...") + result = fib(n) + print(f"fib({n}) = {result}") + + # Show cache statistics + print(f"\nCache info: {fib.cache_info()}") + + +if __name__ == "__main__": + main() diff --git a/scenario2-memoization/fib_slow.py b/scenario2-memoization/fib_slow.py new file mode 100644 index 0000000..0b192d3 --- /dev/null +++ b/scenario2-memoization/fib_slow.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Scenario 2: Hidden Redundancy - The Memoization Problem +======================================================== +This program computes Fibonacci numbers recursively. +The naive implementation has exponential time complexity due to redundant calls. + +EXERCISES: +1. Run: time python3 fib_slow.py 35 +2. Profile: python3 -m cProfile -s ncalls fib_slow.py 35 +3. Notice the HUGE number of calls to fib() +4. See fib_cached.py for the memoized version +""" + +import sys + + +def fib(n): + """Compute the nth Fibonacci number recursively.""" + if n <= 1: + return n + return fib(n - 1) + fib(n - 2) + + +def main(): + n = 35 # Don't go much higher without optimization! + if len(sys.argv) > 1: + n = int(sys.argv[1]) + + print(f"Computing fib({n})...") + result = fib(n) + print(f"fib({n}) = {result}") + + +if __name__ == "__main__": + main() diff --git a/scenario3-syscall-storm/Makefile b/scenario3-syscall-storm/Makefile new file mode 100644 index 0000000..8f707bd --- /dev/null +++ b/scenario3-syscall-storm/Makefile @@ -0,0 +1,24 @@ +CC = gcc +CFLAGS = -O2 -Wall + +all: read_slow read_fast read_stdio testfile + +read_slow: read_slow.c + $(CC) $(CFLAGS) -o $@ $< + +read_fast: read_fast.c + $(CC) $(CFLAGS) -o $@ $< + +read_stdio: read_stdio.c + $(CC) $(CFLAGS) -o $@ $< + +testfile: + dd if=/dev/urandom of=testfile bs=1M count=1 + +smallfile: + dd if=/dev/urandom of=smallfile bs=10K count=1 + +clean: + rm -f read_slow read_fast read_stdio testfile smallfile + +.PHONY: all clean testfile smallfile diff --git a/scenario3-syscall-storm/README.md b/scenario3-syscall-storm/README.md new file mode 100644 index 0000000..c71f855 --- /dev/null +++ b/scenario3-syscall-storm/README.md @@ -0,0 +1,141 @@ +# Scenario 3: The Syscall Storm + +## Learning Objectives +- Understand the cost of syscalls (user-space to kernel-space transitions) +- Use `strace -c` to count and profile syscalls +- Learn why buffering matters for I/O performance +- Understand `time` output: `real`, `user`, `sys` + +## Files +- `read_slow.c` - Reads file byte-by-byte with raw `read()` syscalls +- `read_fast.c` - Reads file in 64KB chunks +- `read_stdio.c` - Uses stdio's `fgetc()` (internally buffered) +- `read_python.py` - Python equivalents + +## Setup + +```bash +# Compile all C programs and create test file +make all +``` + +## Exercise 1: Observe the Problem + +### Step 1: Time the slow version +```bash +time ./read_slow testfile +``` + +Notice that `sys` time dominates - the CPU is mostly waiting for syscalls! + +### Step 2: Count the syscalls +```bash +strace -c ./read_slow testfile +``` + +Look at: +- `calls` column for `read`: Should be ~1,000,000 (one per byte!) 
+- `% time` column: Most time in `read` + +### Step 3: Compare with fast version +```bash +time ./read_fast testfile +strace -c ./read_fast testfile +``` + +The `read` call count drops from ~1,000,000 to ~16 (1MB / 64KB). + +## Exercise 2: Understanding the Time Output + +```bash +time ./read_slow testfile +``` + +Output explanation: +- `real` - Wall clock time (what you'd measure with a stopwatch) +- `user` - CPU time in user space (your code running) +- `sys` - CPU time in kernel space (syscalls, I/O) + +For `read_slow`: `sys` >> `user` because we spend most time in the kernel. +For `read_fast`: `user` > `sys` because we spend more time processing data. + +## Exercise 3: strace Deep Dive + +### See individual syscalls (first 50) +```bash +strace -T ./read_slow testfile 2>&1 | head -50 +``` + +The `-T` flag shows time spent in each syscall. + +### Filter to just read() calls +```bash +strace -e read -c ./read_slow testfile +``` + +## Exercise 4: stdio Comparison + +```bash +time ./read_stdio testfile +strace -c ./read_stdio testfile +``` + +Questions: +- How many `read` syscalls does stdio make? +- Why is it still slower than `read_fast`? +- Hint: Check the default stdio buffer size (usually 4KB or 8KB) + +## Exercise 5: Python I/O + +```bash +# Create smaller file for unbuffered test +make smallfile + +# Run Python comparison +python3 read_python.py smallfile + +# Profile with strace +strace -c python3 read_python.py smallfile +``` + +## Key Insights + +### Why are syscalls expensive? +1. **Context switch**: CPU saves user state, loads kernel state +2. **Security checks**: Kernel validates permissions +3. **Memory barriers**: Caches may need flushing +4. **Scheduling**: Kernel may switch to another process + +### The buffering solution +Instead of: +``` +read(1 byte) → kernel → read(1 byte) → kernel → ... (million times) +``` + +We do: +``` +read(64KB) → kernel → process 64KB in user space → read(64KB) → ... +``` + +### Rule of thumb +- Syscall overhead: ~100-1000 nanoseconds +- Reading 64KB: ~10-100 microseconds (from cache/RAM) +- Break-even: buffer should be at least a few KB + +## perf stat Comparison + +```bash +perf stat ./read_slow testfile +perf stat ./read_fast testfile +``` + +Look at: +- `context-switches` +- `cpu-migrations` +- `instructions per cycle` + +## Further Exploration + +1. What happens with `read(fd, buf, 4096)` vs `read(fd, buf, 65536)`? +2. How does `mmap()` compare? (Memory-mapped I/O) +3. What about `O_DIRECT` flag? (Bypass page cache) diff --git a/scenario3-syscall-storm/read_fast.c b/scenario3-syscall-storm/read_fast.c new file mode 100644 index 0000000..dbdf668 --- /dev/null +++ b/scenario3-syscall-storm/read_fast.c @@ -0,0 +1,52 @@ +/* + * Scenario 3: Buffered I/O - The Fix + * =================================== + * This program reads a file in large chunks, dramatically reducing syscalls. + * + * Compile: gcc -O2 -o read_fast read_fast.c + * + * EXERCISES: + * 1. Run: time ./read_fast testfile + * 2. Compare: strace -c ./read_fast testfile + * 3. Notice the ~1000x reduction in syscalls + */ + +#include +#include +#include +#include + +#define BUFFER_SIZE 65536 /* 64KB buffer */ + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + char buffer[BUFFER_SIZE]; + unsigned long byte_count = 0; + unsigned long checksum = 0; + ssize_t bytes_read; + + /* Read in large chunks - much better! 
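+     * Each call now returns up to BUFFER_SIZE (64KB) bytes, so the 1MB
+     * test file needs roughly 16 read() syscalls instead of ~1,048,576.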
*/ + while ((bytes_read = read(fd, buffer, BUFFER_SIZE)) > 0) { + for (ssize_t i = 0; i < bytes_read; i++) { + checksum += (unsigned char)buffer[i]; + } + byte_count += bytes_read; + } + + close(fd); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/read_python.py b/scenario3-syscall-storm/read_python.py new file mode 100644 index 0000000..7781030 --- /dev/null +++ b/scenario3-syscall-storm/read_python.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Scenario 3: Syscall Storm in Python +==================================== +This demonstrates buffered vs unbuffered I/O in Python. + +EXERCISES: +1. Create test file: dd if=/dev/urandom of=testfile bs=1M count=1 +2. Run: python3 read_python.py testfile +3. Profile: strace -c python3 read_python.py testfile +""" + +import sys +import os +import time + + +def read_unbuffered(filename): + """Read file byte-by-byte using os.read (raw syscalls).""" + fd = os.open(filename, os.O_RDONLY) + byte_count = 0 + checksum = 0 + + while True: + data = os.read(fd, 1) # Read 1 byte at a time! + if not data: + break + byte_count += 1 + checksum += data[0] + + os.close(fd) + return byte_count, checksum + + +def read_buffered(filename): + """Read file using Python's buffered I/O.""" + byte_count = 0 + checksum = 0 + + with open(filename, 'rb') as f: + while True: + # Python's file object is buffered internally + data = f.read(65536) # Read 64KB chunks + if not data: + break + byte_count += len(data) + checksum += sum(data) + + return byte_count, checksum + + +def read_byte_by_byte_buffered(filename): + """ + Read byte-by-byte but through Python's buffered file object. + This is slow in Python but not due to syscalls! + """ + byte_count = 0 + checksum = 0 + + with open(filename, 'rb') as f: + while True: + data = f.read(1) # Looks like 1 byte, but file is buffered + if not data: + break + byte_count += 1 + checksum += data[0] + + return byte_count, checksum + + +def benchmark(name, func, filename): + """Run and time a function.""" + start = time.perf_counter() + byte_count, checksum = func(filename) + elapsed = time.perf_counter() - start + print(f"{name:30s}: {elapsed:.3f}s ({byte_count} bytes, checksum={checksum})") + return elapsed + + +def main(): + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} ") + print("\nCreate a test file with:") + print(" dd if=/dev/urandom of=testfile bs=1M count=1") + sys.exit(1) + + filename = sys.argv[1] + + print("Reading file with different strategies...\n") + + t_buf = benchmark("Buffered (64KB chunks)", read_buffered, filename) + t_byte_buf = benchmark("Byte-by-byte (buffered file)", read_byte_by_byte_buffered, filename) + + # Only run unbuffered test if file is small (< 100KB) + file_size = os.path.getsize(filename) + if file_size < 100_000: + t_unbuf = benchmark("Unbuffered (raw syscalls)", read_unbuffered, filename) + print(f"\nSpeedup (buffered vs unbuffered): {t_unbuf/t_buf:.1f}x") + else: + print(f"\nSkipping unbuffered test (file too large: {file_size} bytes)") + print("For unbuffered test, create smaller file: dd if=/dev/urandom of=smallfile bs=10K count=1") + print("Then run: strace -c python3 read_python.py smallfile") + + +if __name__ == "__main__": + main() diff --git a/scenario3-syscall-storm/read_slow.c b/scenario3-syscall-storm/read_slow.c new file mode 100644 index 0000000..226a2b1 --- /dev/null +++ b/scenario3-syscall-storm/read_slow.c @@ -0,0 +1,49 @@ +/* + * Scenario 3: The Syscall Storm - Unbuffered I/O + * 
=============================================== + * This program reads a file byte-by-byte using raw read() syscalls. + * Each byte triggers a context switch to the kernel - extremely slow! + * + * Compile: gcc -O2 -o read_slow read_slow.c + * + * EXERCISES: + * 1. Create a test file: dd if=/dev/urandom of=testfile bs=1M count=1 + * 2. Run: time ./read_slow testfile + * 3. Profile: strace -c ./read_slow testfile + * 4. Compare with read_fast.c + */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + char c; + unsigned long byte_count = 0; + unsigned long checksum = 0; + + /* Read one byte at a time - TERRIBLE for performance! */ + while (read(fd, &c, 1) == 1) { + byte_count++; + checksum += (unsigned char)c; + } + + close(fd); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/read_stdio.c b/scenario3-syscall-storm/read_stdio.c new file mode 100644 index 0000000..da22694 --- /dev/null +++ b/scenario3-syscall-storm/read_stdio.c @@ -0,0 +1,47 @@ +/* + * Scenario 3: stdio Buffering + * =========================== + * This version uses fgetc() which is buffered by stdio. + * It reads byte-by-byte in C code, but stdio buffers internally. + * + * Compile: gcc -O2 -o read_stdio read_stdio.c + * + * EXERCISES: + * 1. Run: time ./read_stdio testfile + * 2. Compare: strace -c ./read_stdio testfile + * 3. Notice fewer syscalls than read_slow, but more than read_fast + * 4. Why? Default stdio buffer is ~4KB, we use 64KB in read_fast + */ + +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + FILE *fp = fopen(argv[1], "rb"); + if (!fp) { + perror("fopen"); + return 1; + } + + int c; + unsigned long byte_count = 0; + unsigned long checksum = 0; + + /* fgetc is buffered internally by stdio */ + while ((c = fgetc(fp)) != EOF) { + byte_count++; + checksum += (unsigned char)c; + } + + fclose(fp); + + printf("Read %lu bytes\n", byte_count); + printf("Checksum: %lu\n", checksum); + + return 0; +} diff --git a/scenario3-syscall-storm/testfile b/scenario3-syscall-storm/testfile new file mode 100644 index 0000000..5745cc8 Binary files /dev/null and b/scenario3-syscall-storm/testfile differ diff --git a/scenario4-cache-misses/Makefile b/scenario4-cache-misses/Makefile new file mode 100644 index 0000000..37dd626 --- /dev/null +++ b/scenario4-cache-misses/Makefile @@ -0,0 +1,15 @@ +CC = gcc +CFLAGS = -O2 -Wall + +all: cache_demo list_vs_array + +cache_demo: cache_demo.c + $(CC) $(CFLAGS) -o $@ $< + +list_vs_array: list_vs_array.c + $(CC) $(CFLAGS) -o $@ $< + +clean: + rm -f cache_demo list_vs_array + +.PHONY: all clean diff --git a/scenario4-cache-misses/README.md b/scenario4-cache-misses/README.md new file mode 100644 index 0000000..d5393cb --- /dev/null +++ b/scenario4-cache-misses/README.md @@ -0,0 +1,148 @@ +# Scenario 4: Cache Misses and Memory Access Patterns + +## Learning Objectives +- Understand CPU cache basics (L1, L2, L3) +- Use `perf stat` to measure cache behavior +- Recognize cache-friendly vs cache-hostile access patterns +- Understand why Big-O notation doesn't tell the whole story + +## Background: How CPU Caches Work + +``` +CPU Core + ↓ +L1 Cache (~32KB, ~4 cycles) + ↓ +L2 Cache (~256KB, ~12 cycles) + ↓ +L3 Cache (~8MB, ~40 
cycles) + ↓ +Main RAM (~64GB, ~200 cycles) +``` + +Key concepts: +- **Cache line**: Data is loaded in chunks (typically 64 bytes) +- **Spatial locality**: If you access byte N, bytes N+1, N+2, ... are likely already cached +- **Temporal locality**: Recently accessed data is likely to be accessed again + +## Files +- `cache_demo.c` - Row-major vs column-major 2D array traversal +- `list_vs_array.c` - Array vs linked list traversal + +## Exercise 1: Row vs Column Major + +### Step 1: Build and run +```bash +make cache_demo +./cache_demo +``` + +You should see column-major is significantly slower (often 3-10x). + +### Step 2: Measure cache misses +```bash +perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo +``` + +Compare the cache miss counts and ratios. + +### Why does this happen? + +C stores 2D arrays in **row-major** order: +``` +Memory: [0][0] [0][1] [0][2] ... [0][COLS-1] [1][0] [1][1] ... + ←————— row 0 ——————→ ←—— row 1 ——→ +``` + +**Row-major access**: Sequential in memory → cache lines are fully utilized +``` +Access: [0][0] [0][1] [0][2] [0][3] ... +Cache: [████████████████] ← one cache line serves 16 ints +``` + +**Column-major access**: Jumping by COLS * sizeof(int) bytes each time +``` +Access: [0][0] [1][0] [2][0] [3][0] ... +Cache: [█_______________] ← load entire line, use 1 int, evict + [█_______________] ← repeat for each access +``` + +## Exercise 2: Array vs Linked List + +### Step 1: Build and run +```bash +make list_vs_array +./list_vs_array +``` + +### Step 2: Measure cache behavior +```bash +perf stat -e cache-misses,cache-references ./list_vs_array +``` + +### Three cases compared: + +| Case | Memory Layout | Cache Behavior | +|------|---------------|----------------| +| Array | Contiguous | Excellent - prefetcher wins | +| List (sequential) | Contiguous (lucky!) | Good - nodes happen to be adjacent | +| List (scattered) | Random | Terrible - every access misses | + +### Why "sequential list" is still slower than array: + +1. **Pointer chasing**: CPU can't prefetch next element (doesn't know address) +2. **Larger elements**: `struct node` is bigger than `int` (includes pointer) +3. **Indirect access**: Extra memory load for the `next` pointer + +## Exercise 3: Deeper perf Analysis + +### See more cache events +```bash +perf stat -e cycles,instructions,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses ./cache_demo +``` + +Events explained: +- `L1-dcache-*`: Level 1 data cache (fastest, smallest) +- `LLC-*`: Last Level Cache (L3, slowest cache before RAM) +- `cycles`: Total CPU cycles +- `instructions`: Total instructions executed +- IPC (instructions per cycle): Higher is better + +### Profile with perf record +```bash +perf record -e cache-misses ./cache_demo +perf report +``` + +This shows which functions cause the most cache misses. + +## Discussion Questions + +1. **Why doesn't the compiler fix this?** + - Compilers can sometimes interchange loops, but: + - Side effects may prevent it + - Aliasing makes it unsafe to assume + - The programmer often knows better + +2. **How big does the array need to be to see this effect?** + - If array fits in L1 cache: No difference + - If array fits in L3 cache: Moderate difference + - If array exceeds L3 cache: Dramatic difference + +3. 
**What about multithreaded code?** + - False sharing: Different threads accessing same cache line + - Cache coherency traffic between cores + +## Real-World Implications + +- **Image processing**: Process row-by-row, not column-by-column +- **Matrix operations**: Libraries like BLAS use cache-blocking +- **Data structures**: Arrays often beat linked lists in practice +- **Database design**: Row stores vs column stores + +## Key Takeaways + +1. **Memory access pattern matters as much as algorithm complexity** +2. **Sequential access is almost always faster than random access** +3. **Measure with `perf stat` before optimizing** +4. **Big-O notation hides constant factors that can be 10-100x** diff --git a/scenario4-cache-misses/cache_demo.c b/scenario4-cache-misses/cache_demo.c new file mode 100644 index 0000000..f1887c1 --- /dev/null +++ b/scenario4-cache-misses/cache_demo.c @@ -0,0 +1,109 @@ +/* + * Scenario 4: Cache Misses - Memory Access Patterns + * ================================================== + * This program demonstrates the performance impact of memory access patterns. + * Row-major vs column-major traversal of a 2D array. + * + * Compile: gcc -O2 -o cache_demo cache_demo.c + * + * EXERCISES: + * 1. Run: ./cache_demo + * 2. Profile: perf stat -e cache-misses,cache-references ./cache_demo + * 3. Why is one so much faster? + */ + +#include +#include +#include +#include + +#define ROWS 8192 +#define COLS 8192 + +/* + * Global array to ensure it's not optimized away. + * This is a 64MB array (8192 * 8192 * sizeof(int) = 256MB if int is 4 bytes) + * Wait, that's too big. Let's use smaller dimensions or chars. + */ + +/* Using static to avoid stack overflow */ +static int matrix[ROWS][COLS]; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +long sum_row_major(void) { + /* + * Row-major traversal: access sequential memory addresses + * Memory layout: [0][0], [0][1], [0][2], ... [0][COLS-1], [1][0], ... + * This matches how C stores 2D arrays - CACHE FRIENDLY + */ + long sum = 0; + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + sum += matrix[i][j]; + } + } + return sum; +} + +long sum_col_major(void) { + /* + * Column-major traversal: jump around in memory + * Access pattern: [0][0], [1][0], [2][0], ... [ROWS-1][0], [0][1], ... 
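+     * Consecutive iterations land in a different row each time, so a
+     * loaded cache line is barely reused before it is evicted: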
+ * Each access is COLS * sizeof(int) bytes apart - CACHE HOSTILE + */ + long sum = 0; + for (int j = 0; j < COLS; j++) { + for (int i = 0; i < ROWS; i++) { + sum += matrix[i][j]; + } + } + return sum; +} + +void init_matrix(void) { + /* Initialize with some values */ + for (int i = 0; i < ROWS; i++) { + for (int j = 0; j < COLS; j++) { + matrix[i][j] = (i + j) % 100; + } + } +} + +int main(void) { + printf("Matrix size: %d x %d = %zu bytes\n", + ROWS, COLS, sizeof(matrix)); + printf("Cache line size (typical): 64 bytes\n"); + printf("Stride in column-major: %zu bytes\n\n", COLS * sizeof(int)); + + init_matrix(); + + double start, elapsed; + long result; + + /* Warm up */ + result = sum_row_major(); + result = sum_col_major(); + + /* Row-major benchmark */ + start = get_time(); + result = sum_row_major(); + elapsed = get_time() - start; + printf("Row-major sum: %ld in %.3f seconds\n", result, elapsed); + + /* Column-major benchmark */ + start = get_time(); + result = sum_col_major(); + elapsed = get_time() - start; + printf("Column-major sum: %ld in %.3f seconds\n", result, elapsed); + + printf("\n"); + printf("To see cache misses, run:\n"); + printf(" perf stat -e cache-misses,cache-references,L1-dcache-load-misses ./cache_demo\n"); + + return 0; +} diff --git a/scenario4-cache-misses/list_vs_array.c b/scenario4-cache-misses/list_vs_array.c new file mode 100644 index 0000000..4c370e7 --- /dev/null +++ b/scenario4-cache-misses/list_vs_array.c @@ -0,0 +1,175 @@ +/* + * Scenario 4b: Array vs Linked List Traversal + * ============================================ + * Arrays have excellent cache locality; linked lists do not. + * This demonstrates why "O(n) vs O(n)" can have very different constants. + * + * Compile: gcc -O2 -o list_vs_array list_vs_array.c + */ + +#include +#include +#include + +#define N 10000000 /* 10 million elements */ + +struct node { + int value; + struct node *next; +}; + +double get_time(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9; +} + +/* Sum array elements */ +long sum_array(int *arr, int n) { + long sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum; +} + +/* Sum linked list elements */ +long sum_list(struct node *head) { + long sum = 0; + struct node *curr = head; + while (curr != NULL) { + sum += curr->value; + curr = curr->next; + } + return sum; +} + +/* Create array */ +int *create_array(int n) { + int *arr = malloc(n * sizeof(int)); + if (!arr) { + perror("malloc array"); + exit(1); + } + for (int i = 0; i < n; i++) { + arr[i] = i % 100; + } + return arr; +} + +/* Create linked list - nodes allocated sequentially (best case for list) */ +struct node *create_list_sequential(int n) { + struct node *nodes = malloc(n * sizeof(struct node)); + if (!nodes) { + perror("malloc list"); + exit(1); + } + + for (int i = 0; i < n - 1; i++) { + nodes[i].value = i % 100; + nodes[i].next = &nodes[i + 1]; + } + nodes[n - 1].value = (n - 1) % 100; + nodes[n - 1].next = NULL; + + return nodes; +} + +/* Create linked list - nodes allocated randomly (worst case for cache) */ +struct node *create_list_scattered(int n) { + /* Allocate nodes individually to scatter them in memory */ + struct node **nodes = malloc(n * sizeof(struct node *)); + if (!nodes) { + perror("malloc"); + exit(1); + } + + /* Allocate each node separately */ + for (int i = 0; i < n; i++) { + nodes[i] = malloc(sizeof(struct node)); + if (!nodes[i]) { + perror("malloc node"); + exit(1); + } + nodes[i]->value = i % 100; + } + + 
/* Shuffle the order (Fisher-Yates) */ + srand(42); + for (int i = n - 1; i > 0; i--) { + int j = rand() % (i + 1); + struct node *tmp = nodes[i]; + nodes[i] = nodes[j]; + nodes[j] = tmp; + } + + /* Link them in shuffled order */ + for (int i = 0; i < n - 1; i++) { + nodes[i]->next = nodes[i + 1]; + } + nodes[n - 1]->next = NULL; + + struct node *head = nodes[0]; + free(nodes); /* Free the pointer array, not the nodes */ + return head; +} + +void free_scattered_list(struct node *head) { + while (head != NULL) { + struct node *next = head->next; + free(head); + head = next; + } +} + +int main(void) { + printf("Comparing array vs linked list traversal (%d elements)\n\n", N); + + double start, elapsed; + long result; + + /* Array */ + printf("Creating array...\n"); + int *arr = create_array(N); + + start = get_time(); + result = sum_array(arr, N); + elapsed = get_time() - start; + printf("Array sum: %ld in %.4f seconds\n", result, elapsed); + double array_time = elapsed; + free(arr); + + /* Sequential linked list (best case for list) */ + printf("\nCreating sequential linked list...\n"); + struct node *list_seq = create_list_sequential(N); + + start = get_time(); + result = sum_list(list_seq); + elapsed = get_time() - start; + printf("List sum (sequential): %ld in %.4f seconds\n", result, elapsed); + double list_seq_time = elapsed; + free(list_seq); + + /* Scattered linked list (worst case for cache) */ + printf("\nCreating scattered linked list (this takes a while)...\n"); + struct node *list_scat = create_list_scattered(N); + + start = get_time(); + result = sum_list(list_scat); + elapsed = get_time() - start; + printf("List sum (scattered): %ld in %.4f seconds\n", result, elapsed); + double list_scat_time = elapsed; + free_scattered_list(list_scat); + + printf("\n--- Summary ---\n"); + printf("Array: %.4fs (baseline)\n", array_time); + printf("List (sequential): %.4fs (%.1fx slower)\n", + list_seq_time, list_seq_time / array_time); + printf("List (scattered): %.4fs (%.1fx slower)\n", + list_scat_time, list_scat_time / array_time); + + printf("\nTo see cache behavior:\n"); + printf(" perf stat -e cache-misses,cache-references ./list_vs_array\n"); + + return 0; +} diff --git a/scenario5-debug-symbols/Makefile b/scenario5-debug-symbols/Makefile new file mode 100644 index 0000000..75dbc28 --- /dev/null +++ b/scenario5-debug-symbols/Makefile @@ -0,0 +1,40 @@ +CC = gcc +CFLAGS_BASE = -O2 -Wall + +all: nodebug withdebug + +nodebug: program.c + $(CC) $(CFLAGS_BASE) -o $@ $< -lm + +withdebug: program.c + $(CC) $(CFLAGS_BASE) -g -o $@ $< -lm + +# Show the size difference +sizes: nodebug withdebug + @echo "File sizes:" + @ls -lh nodebug withdebug + @echo "" + @echo "Section sizes (nodebug):" + @size nodebug + @echo "" + @echo "Section sizes (withdebug):" + @size withdebug + +# Show symbols +symbols: nodebug withdebug + @echo "Symbols in nodebug:" + @nm nodebug | grep -E "compute|process|main" || true + @echo "" + @echo "Symbols in withdebug:" + @nm withdebug | grep -E "compute|process|main" || true + +# Strip the debug version to show what happens +stripped: withdebug + cp withdebug stripped + strip stripped + @echo "Stripped file created" + +clean: + rm -f nodebug withdebug stripped perf.data perf.data.old + +.PHONY: all sizes symbols clean stripped diff --git a/scenario5-debug-symbols/README.md b/scenario5-debug-symbols/README.md new file mode 100644 index 0000000..fbb0659 --- /dev/null +++ b/scenario5-debug-symbols/README.md @@ -0,0 +1,179 @@ +# Scenario 5: Debug Symbols - The Missing Map + +## 
Learning Objectives +- Understand what debug symbols are and why they matter +- Compare profiling output with and without symbols +- Use `perf annotate` to see source-level hotspots +- Understand the trade-offs of shipping debug symbols + +## Background + +When you compile C code, the compiler translates your source into machine code. +By default, the connection between source lines and machine instructions is lost. + +**Debug symbols** (enabled with `-g`) preserve this mapping: +- Function names +- Source file names and line numbers +- Variable names and types +- Inline function information + +## Files +- `program.c` - A program with nested function calls +- `Makefile` - Builds `nodebug` and `withdebug` versions + +## Exercise 1: Build Both Versions + +```bash +make all +``` + +Compare file sizes: +```bash +make sizes +``` + +The `withdebug` binary is larger due to DWARF debug sections. + +## Exercise 2: Profile Without Debug Symbols + +```bash +perf record ./nodebug 500 5000 +perf report +``` + +You'll see something like: +``` + 45.23% nodebug nodebug [.] 0x0000000000001234 + 32.11% nodebug nodebug [.] 0x0000000000001456 + 12.45% nodebug libm.so [.] __sin_fma +``` + +The hex addresses (`0x...`) tell you nothing useful! + +## Exercise 3: Profile With Debug Symbols + +```bash +perf record ./withdebug 500 5000 +perf report +``` + +Now you see: +``` + 45.23% withdebug withdebug [.] compute_inner + 32.11% withdebug withdebug [.] compute_middle + 12.45% withdebug libm.so [.] __sin_fma +``` + +Much better! You can see which functions are hot. + +## Exercise 4: Source-Level Annotation + +With debug symbols, you can see exactly which lines are hot: + +```bash +perf record ./withdebug 500 5000 +perf annotate compute_inner +``` + +This shows the source code with cycle counts per line! + +``` + │ double compute_inner(double x, int iterations) { + │ double result = x; + 1.23 │ for (int i = 0; i < iterations; i++) { + 45.67 │ result = sin(result) * cos(result) + sqrt(fabs(result)); + 23.45 │ result = result * 1.0001 + 0.0001; + │ } + │ return result; + │ } +``` + +## Exercise 5: Understanding Symbol Tables + +```bash +# Show symbols in each binary +make symbols + +# Or manually: +nm nodebug | head -20 +nm withdebug | head -20 + +# Show more detail about sections +readelf -S withdebug | grep debug +``` + +Debug sections you might see: +- `.debug_info` - Type information +- `.debug_line` - Line number tables +- `.debug_str` - String table +- `.debug_abbrev` - Abbreviation tables + +## Exercise 6: Stripping Symbols + +Production binaries are often "stripped" to reduce size: + +```bash +make stripped +ls -l withdebug stripped + +# Try to profile the stripped binary +perf record ./stripped 500 5000 +perf report +``` + +The stripped binary loses symbol information too! + +## Exercise 7: Separate Debug Files + +In production, you can ship stripped binaries but keep debug info separate: + +```bash +# Extract debug info to separate file +objcopy --only-keep-debug withdebug withdebug.debug + +# Strip the binary +strip withdebug -o withdebug.stripped + +# Add a link to the debug file +objcopy --add-gnu-debuglink=withdebug.debug withdebug.stripped + +# Now perf can find the debug info +perf record ./withdebug.stripped 500 5000 +perf report +``` + +This is how Linux distros provide `-dbg` or `-debuginfo` packages. + +## Discussion Questions + +1. 
**Why don't we always compile with `-g`?** + - Binary size (can be 2-10x larger) + - Exposes source structure (security/IP concerns) + - Some optimizations may be affected (though `-O2 -g` works well) + +2. **Does `-g` affect performance?** + - Generally no: debug info is stored in separate sections + - Not loaded unless a debugger attaches + - Some edge cases with frame pointers + +3. **What about release vs debug builds?** + - Debug build: `-O0 -g` (no optimization, full debug) + - Release build: `-O2 -g` (optimized, with symbols) + - Stripped release: `-O2` then `strip` + +## Key Takeaways + +1. **Always compile with `-g` during development** +2. **Debug symbols don't meaningfully affect runtime performance** +3. **Without symbols, profilers show useless hex addresses** +4. **Production: ship stripped binaries, keep debug files for crash analysis** + +## Bonus: Flamegraph Generation + +```bash +# Record with call graph +perf record -g ./withdebug 500 5000 + +# Generate flamegraph (requires FlameGraph scripts) +perf script | /path/to/FlameGraph/stackcollapse-perf.pl | /path/to/FlameGraph/flamegraph.pl > profile.svg +``` diff --git a/scenario5-debug-symbols/program.c b/scenario5-debug-symbols/program.c new file mode 100644 index 0000000..c5a23b3 --- /dev/null +++ b/scenario5-debug-symbols/program.c @@ -0,0 +1,88 @@ +/* + * Scenario 5: Debug Symbols - The Missing Map + * ============================================ + * This program has multiple functions calling each other. + * We'll compile it with and without debug symbols to show the difference. + * + * Compile WITHOUT debug symbols: + * gcc -O2 -o nodebug program.c + * + * Compile WITH debug symbols: + * gcc -O2 -g -o withdebug program.c + * + * EXERCISES: + * 1. perf record ./nodebug && perf report (see hex addresses) + * 2. perf record ./withdebug && perf report (see function names) + * 3. perf annotate -s compute_inner (see source code!) 
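+ * 4. make stripped, then: perf record ./stripped 500 5000 && perf report
+ *    (the symbols disappear again - see Exercise 6 in the README)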
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* Deliberately NOT inlined to show in profiler */
+__attribute__((noinline))
+double compute_inner(double x, int iterations) {
+    double result = x;
+    for (int i = 0; i < iterations; i++) {
+        result = sin(result) * cos(result) + sqrt(fabs(result));
+        result = result * 1.0001 + 0.0001;
+    }
+    return result;
+}
+
+__attribute__((noinline))
+double compute_middle(double x, int iterations) {
+    double sum = 0.0;
+    for (int i = 0; i < 10; i++) {
+        sum += compute_inner(x + i, iterations);
+    }
+    return sum;
+}
+
+__attribute__((noinline))
+double compute_outer(int n, int iterations) {
+    double total = 0.0;
+    for (int i = 0; i < n; i++) {
+        total += compute_middle((double)i / n, iterations);
+    }
+    return total;
+}
+
+__attribute__((noinline))
+void process_data(int *data, int size) {
+    /* Some additional work to show in the profile */
+    long sum = 0;
+    for (int i = 0; i < size; i++) {
+        for (int j = 0; j < 100; j++) {
+            sum += data[i] * j;
+        }
+    }
+    printf("Data checksum: %ld\n", sum % 10000);
+}
+
+int main(int argc, char *argv[]) {
+    int n = 1000;
+    int iterations = 10000;
+
+    if (argc > 1) n = atoi(argv[1]);
+    if (argc > 2) iterations = atoi(argv[2]);
+
+    printf("Running with n=%d, iterations=%d\n", n, iterations);
+
+    /* Allocate some data */
+    int *data = malloc(n * sizeof(int));
+    for (int i = 0; i < n; i++) {
+        data[i] = i * 17 + 3;
+    }
+
+    /* Do some computation */
+    double result = compute_outer(n, iterations);
+    printf("Computation result: %f\n", result);
+
+    /* Process data */
+    process_data(data, n);
+
+    free(data);
+    return 0;
+}
diff --git a/scenario6-usdt-probes/Makefile b/scenario6-usdt-probes/Makefile
new file mode 100644
index 0000000..e264aa0
--- /dev/null
+++ b/scenario6-usdt-probes/Makefile
@@ -0,0 +1,35 @@
+CC = gcc
+CFLAGS = -O2 -g -Wall
+
+# Check if sys/sdt.h is available
+SDT_CHECK := $(shell echo '\#include <sys/sdt.h>' | $(CC) -E - >/dev/null 2>&1 && echo yes)
+
+ifeq ($(SDT_CHECK),yes)
+    CFLAGS += -DHAVE_SDT
+    SDT_STATUS = "USDT probes ENABLED"
+else
+    SDT_STATUS = "USDT probes DISABLED (install systemtap-sdt-dev)"
+endif
+
+all: server
+	@echo $(SDT_STATUS)
+
+server: server.c
+	$(CC) $(CFLAGS) -o $@ $<
+
+# List the probes embedded in the binary
+list-probes: server
+	@echo "Probes in binary (requires USDT support):"
+	readelf -n server 2>/dev/null | grep -A2 "stapsdt" || echo "No probes found"
+
+# Alternative: use perf to list probes
+perf-list: server
+	perf probe -x ./server --list 2>/dev/null || echo "No probes found or perf not supported"
+
+clean:
+	rm -f server
+
+install-deps:
+	sudo apt install systemtap-sdt-dev bpftrace
+
+.PHONY: all clean list-probes perf-list install-deps
diff --git a/scenario6-usdt-probes/README.md b/scenario6-usdt-probes/README.md
new file mode 100644
index 0000000..2da18a6
--- /dev/null
+++ b/scenario6-usdt-probes/README.md
@@ -0,0 +1,184 @@
+# Scenario 6: USDT Probes - Custom Instrumentation
+
+## Learning Objectives
+- Understand the difference between static and dynamic probes
+- Add USDT probes to C code
+- Trace probes with bpftrace
+- See how probes enable production debugging
+
+## Background
+
+**Dynamic probes** (like `perf probe`): added at runtime, can attach to almost any function
+**Static probes** (USDT): compiled into the binary, designed by the developer
+
+USDT probes are:
+- Nearly zero overhead when not traced
+- Stable API for debuggers/tracers
+- Self-documenting: probe names describe what's happening
+- Used by Python, Ruby, MySQL, PostgreSQL, and many other projects
+
+## Prerequisites
+
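+Before installing anything, you can check whether the SDT header is already present.
+This is the same test the `Makefile` runs internally (a quick sketch; if it prints
+`no`, install the packages below):
+
+```bash
+# Prints "yes" if the preprocessor can find <sys/sdt.h>, "no" otherwise
+echo '#include <sys/sdt.h>' | gcc -E - >/dev/null 2>&1 && echo yes || echo no
+```
+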
+```bash
+# Install USDT support and bpftrace
+sudo apt install systemtap-sdt-dev bpftrace
+```
+
+## Files
+- `server.c` - Simulated server with USDT probes at key points
+
+## Exercise 1: Build and Run
+
+```bash
+make
+./server 3 10
+```
+
+This processes 3 batches of 10 requests each.
+
+## Exercise 2: List the Probes
+
+```bash
+# Using readelf
+readelf -n ./server | grep -A2 stapsdt
+
+# Or using perf
+sudo perf probe -x ./server --list
+```
+
+You should see probes like:
+- `myserver:server_start`
+- `myserver:request_start`
+- `myserver:request_end`
+- `myserver:request_error`
+
+## Exercise 3: Trace with bpftrace
+
+### Count requests by type
+```bash
+# In terminal 1:
+sudo bpftrace -e '
+usdt:./server:myserver:request_start {
+    @types[arg1] = count();
+}
+END {
+    print(@types);
+}
+'
+
+# In terminal 2:
+./server 5 50
+```
+
+### Measure request latency
+```bash
+sudo bpftrace -e '
+usdt:./server:myserver:request_start {
+    @start[arg0] = nsecs;
+}
+usdt:./server:myserver:request_end {
+    $latency = (nsecs - @start[arg0]) / 1000000;
+    @latency_ms = hist($latency);
+    delete(@start[arg0]);
+}
+END {
+    print(@latency_ms);
+}
+'
+```
+
+### Track errors
+```bash
+sudo bpftrace -e '
+usdt:./server:myserver:request_error {
+    printf("ERROR: request %d failed with code %d\n", arg0, arg1);
+    @errors = count();
+}
+'
+```
+
+## Exercise 4: Trace with perf
+
+```bash
+# Register the SDT probe with perf (SDT events live in the sdt_<provider> group)
+sudo perf probe -x ./server sdt_myserver:request_start
+
+# Record
+sudo perf record -e sdt_myserver:request_start ./server 3 20
+
+# Report
+sudo perf report
+```
+
+## How USDT Probes Work
+
+The `DTRACE_PROBE` macro inserts a NOP instruction:
+
+```c
+DTRACE_PROBE2(myserver, request_start, req->id, req->type);
+```
+
+Compiles to something like:
+```asm
+nop    ; placeholder for probe
+```
+
+When you activate tracing:
+1. Tracer finds the probe location (stored in ELF notes)
+2. Replaces NOP with a trap instruction (INT3 on x86)
+3. Trap triggers, tracer runs, returns control
+4. When tracing stops, NOP is restored
+
+**Overhead when not tracing**: ~0 (just a NOP)
+**Overhead when tracing**: trap + handler execution
+
+## Real-World Uses
+
+### Python has USDT probes:
+```bash
+# If Python was built with --enable-dtrace
+sudo bpftrace -e 'usdt:/usr/bin/python3:python:function__entry { printf("%s\n", str(arg1)); }'
+```
+
+### MySQL probes for query tracking:
+```bash
+sudo bpftrace -e 'usdt:/usr/sbin/mysqld:mysql:query__start { printf("%s\n", str(arg0)); }'
+```
+
+## Discussion Questions
+
+1. **When would you use USDT vs dynamic probes?**
+   - USDT: Known important points, stable interface
+   - Dynamic: Ad-hoc debugging, no source changes
+
+2. **What's the trade-off of adding probes?**
+   - Pro: Always available for debugging
+   - Con: Must plan ahead, adds code complexity
+
+3. **Why not just use printf debugging?**
+   - Printf has overhead even when you don't need it
+   - USDT has zero overhead until activated
+   - USDT can be traced without rebuilding
+
+## Advanced: Creating Custom Probes
+
+The probe macros from `<sys/sdt.h>`:
+
+```c
+DTRACE_PROBE(provider, name)               // No arguments
+DTRACE_PROBE1(provider, name, arg1)        // 1 argument
+DTRACE_PROBE2(provider, name, arg1, arg2)  // 2 arguments
+// ... up to DTRACE_PROBE12
+```
+
+Arguments can be integers or pointers. Strings need special handling.
+A minimal sketch of a probe in your own program follows the Key Takeaways below.
+
+## Key Takeaways
+
+1. **USDT probes are designed-in observability**
+2. **Zero overhead when not actively tracing**
+3. **bpftrace makes probe usage easy**
+4. **Many production systems already have probes (Python, databases, etc.)**
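+
+As a pointer back to the "Advanced: Creating Custom Probes" section, here is a minimal
+sketch of a probe in your own program. The file name `demo.c` and the provider name
+`demo` are made up for illustration, and it assumes `systemtap-sdt-dev` is installed:
+
+```c
+/* demo.c - build with: gcc -O2 -g -o demo demo.c */
+#include <stdio.h>
+#include <sys/sdt.h>
+
+int main(void) {
+    for (int i = 0; i < 100; i++) {
+        /* provider "demo", probe "item_processed", one integer argument */
+        DTRACE_PROBE1(demo, item_processed, i);
+    }
+    printf("processed 100 items\n");
+    return 0;
+}
+```
+
+Trace it the same way as the workshop server, e.g.
+`sudo bpftrace -e 'usdt:./demo:demo:item_processed { @items = count(); }'`.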
+
+This is an advanced topic - the main takeaway for beginners is that
+such instrumentation exists and enables powerful production debugging.
diff --git a/scenario6-usdt-probes/server.c b/scenario6-usdt-probes/server.c
new file mode 100644
index 0000000..2d56980
--- /dev/null
+++ b/scenario6-usdt-probes/server.c
@@ -0,0 +1,141 @@
+/*
+ * Scenario 6: USDT Probes - Custom Instrumentation
+ * =================================================
+ * This program demonstrates User Statically Defined Tracepoints (USDT).
+ * USDT probes allow you to add custom tracing points to your code
+ * that have near-zero overhead when not actively traced.
+ *
+ * Compile (use `make`, which defines HAVE_SDT when sys/sdt.h is available):
+ *   gcc -DHAVE_SDT -O2 -g -o server server.c
+ *
+ * The probes use sys/sdt.h which is provided by systemtap-sdt-dev on Ubuntu.
+ * Install: sudo apt install systemtap-sdt-dev
+ *
+ * EXERCISES:
+ * 1. Run normally: ./server
+ * 2. List probes: perf probe -x ./server --list
+ * 3. Trace probes: See README.md for bpftrace examples
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+
+/*
+ * USDT probe macros from systemtap-sdt-dev
+ * If not available, we define dummy macros
+ */
+#ifdef HAVE_SDT
+#include <sys/sdt.h>
+#else
+/* Dummy macros when SDT isn't available */
+#define DTRACE_PROBE(provider, name)
+#define DTRACE_PROBE1(provider, name, arg1)
+#define DTRACE_PROBE2(provider, name, arg1, arg2)
+#define DTRACE_PROBE3(provider, name, arg1, arg2, arg3)
+#endif
+
+/* Simulated request structure */
+struct request {
+    int id;
+    int type;       /* 0=read, 1=write, 2=delete */
+    int size;       /* payload size */
+    char data[64];
+};
+
+/* Statistics */
+static unsigned long total_requests = 0;
+static unsigned long total_errors = 0;
+
+/*
+ * Process a single request
+ * We add USDT probes at entry, exit, and error points
+ */
+int process_request(struct request *req) {
+    /* Probe: request processing started */
+    DTRACE_PROBE2(myserver, request_start, req->id, req->type);
+
+    total_requests++;
+
+    /* Simulate variable processing time based on request type */
+    int delay_ms;
+    switch (req->type) {
+        case 0: delay_ms = 10 + (rand() % 20); break;  /* read: 10-30ms */
+        case 1: delay_ms = 50 + (rand() % 50); break;  /* write: 50-100ms */
+        case 2: delay_ms = 5 + (rand() % 10); break;   /* delete: 5-15ms */
+        default: delay_ms = 100;
+    }
+
+    /* Simulate some work */
+    usleep(delay_ms * 1000);
+
+    /* Simulate occasional errors (10% chance) */
+    if (rand() % 10 == 0) {
+        DTRACE_PROBE2(myserver, request_error, req->id, -1);
+        total_errors++;
+        return -1;
+    }
+
+    /* Probe: request completed successfully */
+    DTRACE_PROBE3(myserver, request_end, req->id, req->type, delay_ms);
+
+    return 0;
+}
+
+/*
+ * Batch processing function
+ */
+void process_batch(int batch_id, int count) {
+    DTRACE_PROBE2(myserver, batch_start, batch_id, count);
+
+    printf("Processing batch %d with %d requests...\n", batch_id, count);
+
+    struct request req;
+    for (int i = 0; i < count; i++) {
+        req.id = batch_id * 1000 + i;
+        req.type = rand() % 3;
+        req.size = 64;
+        snprintf(req.data, sizeof(req.data), "request_%d", req.id);
+
+        process_request(&req);
+    }
+
+    DTRACE_PROBE1(myserver, batch_end, batch_id);
+}
+
+int main(int argc, char *argv[]) {
+    int num_batches = 5;
+    int requests_per_batch = 20;
+
+    if (argc > 1) num_batches = atoi(argv[1]);
+    if (argc > 2) requests_per_batch = atoi(argv[2]);
+
+    srand(time(NULL));
+
+    printf("USDT Probe Demo Server\n");
+    printf("======================\n");
+    printf("Batches: %d, Requests per batch:
%d\n\n", num_batches, requests_per_batch); + +#ifndef HAVE_SDT + printf("Note: Compiled without USDT support.\n"); + printf("To enable probes: sudo apt install systemtap-sdt-dev\n"); + printf("Then compile with: gcc -DHAVE_SDT -O2 -g -o server server.c\n\n"); +#endif + + DTRACE_PROBE(myserver, server_start); + + for (int i = 0; i < num_batches; i++) { + process_batch(i, requests_per_batch); + } + + DTRACE_PROBE(myserver, server_stop); + + printf("\n=== Summary ===\n"); + printf("Total requests: %lu\n", total_requests); + printf("Total errors: %lu (%.1f%%)\n", + total_errors, 100.0 * total_errors / total_requests); + + return 0; +} diff --git a/scenario7-pyroscope/README.md b/scenario7-pyroscope/README.md new file mode 100644 index 0000000..5666938 --- /dev/null +++ b/scenario7-pyroscope/README.md @@ -0,0 +1,195 @@ +# Scenario 7: Continuous Profiling with Pyroscope + +## Learning Objectives +- Understand the difference between one-shot and continuous profiling +- Set up and use Pyroscope for Python applications +- Navigate the Pyroscope UI to find performance issues +- Compare flamegraphs over time + +## Background + +**One-shot profiling** (what we've done so far): +- Run profiler → Execute program → Stop → Analyze +- Good for: reproducible tests, specific scenarios +- Bad for: intermittent issues, production systems + +**Continuous profiling**: +- Always running in the background +- Low overhead (~2-5% CPU) +- Aggregates data over time +- Good for: production monitoring, finding intermittent issues + +## Files +- `app.py` - Flask web application with Pyroscope instrumentation +- `loadgen.sh` - Script to generate traffic +- `requirements.txt` - Python dependencies + +## Setup + +### 1. Start Pyroscope Server + +Option A: Docker (recommended) +```bash +docker run -d --name pyroscope -p 4040:4040 grafana/pyroscope +``` + +Option B: Binary download +```bash +# Download from https://github.com/grafana/pyroscope/releases +./pyroscope server +``` + +### 2. Install Python Dependencies +```bash +pip install -r requirements.txt +# Or: pip install flask pyroscope-io +``` + +### 3. Start the Application +```bash +python3 app.py +``` + +### 4. Generate Load +```bash +chmod +x loadgen.sh +./loadgen.sh http://localhost:5000 120 # 2 minutes of load +``` + +### 5. View Profiles +Open http://localhost:4040 in your browser. + +## Exercise 1: Explore the Pyroscope UI + +1. Go to http://localhost:4040 +2. Select `workshop.flask.app` from the application dropdown +3. Observe the flamegraph + +### UI Navigation +- **Timeline**: Shows CPU usage over time, click to select time range +- **Flamegraph**: Visual representation of where time is spent +- **Table view**: Sortable list of functions by self/total time +- **Diff view**: Compare two time ranges + +## Exercise 2: Find the Hot Function + +While `loadgen.sh` is running: + +1. Look at the flamegraph +2. Find `compute_primes_slow` - it should be prominent +3. Click on it to zoom in +4. See the call stack leading to it + +## Exercise 3: Compare Cached vs Uncached + +1. Note the current time +2. Stop `loadgen.sh` +3. Modify `loadgen.sh` to only hit cached endpoints (or run manually): + ```bash + for i in $(seq 100); do + curl -s "localhost:5000/api/hash_cached/test_$((i % 5))" + done + ``` +4. In Pyroscope, compare the two time periods using the diff view + +## Exercise 4: Spot I/O-Bound Code + +1. Generate load to the slow_io endpoint: + ```bash + for i in $(seq 50); do curl -s localhost:5000/api/slow_io; done + ``` +2. Look at the flamegraph +3. 
Notice that `time.sleep` doesn't show up much - why? + - CPU profiling only captures CPU time + - I/O wait (sleeping, network, disk) doesn't consume CPU + - This is why I/O-bound code looks "fast" in CPU profiles! + +## Exercise 5: Timeline Analysis + +1. Let `loadgen.sh` run for several minutes +2. In Pyroscope, zoom out the timeline +3. Look for patterns: + - Spikes in CPU usage + - Changes in the flamegraph shape over time +4. Select different time ranges to compare + +## Key Pyroscope Concepts + +### Flamegraph Reading +- **Width** = proportion of total samples (time) +- **Height** = call stack depth +- **Color** = usually arbitrary (for differentiation) +- **Plateaus** = functions that are "hot" + +### Comparing Profiles +Pyroscope can show: +- **Diff view**: Red = more time, Green = less time +- Useful for before/after comparisons + +### Tags +The app uses tags for filtering: +```python +pyroscope.configure( + tags={"env": "workshop", "version": "1.0.0"} +) +``` + +You can filter by tags in the UI. + +## Production Considerations + +### Overhead +- Pyroscope Python agent: ~2-5% CPU overhead +- Sampling rate can be tuned (default: 100Hz) + +### Data Volume +- Profiles are aggregated, not stored raw +- Storage is efficient (10-100MB per day per app) + +### Security +- Profile data can reveal code structure +- Consider who has access to Pyroscope + +### Alternatives +- **Datadog Continuous Profiler** +- **AWS CodeGuru Profiler** +- **Google Cloud Profiler** +- **Parca** (open source, eBPF-based) + +## Troubleshooting + +### "No data in Pyroscope" +- Check if Pyroscope server is running: http://localhost:4040 +- Check app logs for connection errors +- Verify `pyroscope-io` is installed + +### "Profile looks empty" +- Generate more load +- The endpoint might be I/O bound (not CPU) +- Check the time range in the UI + +### High overhead +- Reduce sampling rate in pyroscope.configure() +- Check for profiling-related exceptions + +## Discussion Questions + +1. **When would you use continuous profiling vs one-shot?** + - Continuous: production, long-running apps, intermittent issues + - One-shot: development, benchmarking, specific scenarios + +2. **What can't CPU profiling show you?** + - I/O wait time + - Lock contention (mostly) + - Memory allocation patterns + +3. **How would you profile a batch job vs a web server?** + - Batch: one-shot profiling of the entire run + - Server: continuous, focus on request handling paths + +## Key Takeaways + +1. **Continuous profiling catches issues that one-shot misses** +2. **Low overhead makes it safe for production** +3. **Timeline view reveals patterns over time** +4. **CPU profiling doesn't show I/O time** diff --git a/scenario7-pyroscope/app.py b/scenario7-pyroscope/app.py new file mode 100644 index 0000000..6b6995b --- /dev/null +++ b/scenario7-pyroscope/app.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Scenario 7: Continuous Profiling with Pyroscope +=============================================== +A simple Flask web app instrumented with Pyroscope for continuous profiling. + +SETUP: +1. Start Pyroscope: docker run -p 4040:4040 grafana/pyroscope +2. Install deps: pip install flask pyroscope-io +3. Run this app: python3 app.py +4. Generate load: ./loadgen.sh (or curl in a loop) +5. View profiles: http://localhost:4040 + +The app has intentionally slow endpoints to demonstrate profiling. 
+"""
+
+import os
+import time
+import math
+import hashlib
+from functools import lru_cache
+
+# Try to import pyroscope, gracefully handle if not installed
+try:
+    import pyroscope
+    PYROSCOPE_AVAILABLE = True
+except ImportError:
+    PYROSCOPE_AVAILABLE = False
+    print("Pyroscope not installed. Run: pip install pyroscope-io")
+    print("Continuing without profiling...\n")
+
+from flask import Flask, jsonify
+
+app = Flask(__name__)
+
+# Configure Pyroscope
+if PYROSCOPE_AVAILABLE:
+    pyroscope.configure(
+        application_name="workshop.flask.app",
+        server_address="http://localhost:4040",
+        # Enable profiling for specific aspects
+        tags={
+            "env": "workshop",
+            "version": "1.0.0",
+        }
+    )
+
+
+# ============================================================
+# Endpoint 1: CPU-intensive computation
+# ============================================================
+
+def compute_primes_slow(n):
+    """Intentionally slow prime computation."""
+    primes = []
+    for num in range(2, n):
+        is_prime = True
+        for i in range(2, int(math.sqrt(num)) + 1):
+            if num % i == 0:
+                is_prime = False
+                break
+        if is_prime:
+            primes.append(num)
+    return primes
+
+
+@app.route('/api/primes/<int:n>')
+def primes_endpoint(n):
+    """CPU-bound endpoint - compute primes up to n."""
+    n = min(n, 50000)  # Limit to prevent DoS
+    start = time.time()
+    primes = compute_primes_slow(n)
+    elapsed = time.time() - start
+    return jsonify({
+        'count': len(primes),
+        'limit': n,
+        'elapsed_ms': round(elapsed * 1000, 2)
+    })
+
+
+# ============================================================
+# Endpoint 2: Repeated expensive computation (needs caching)
+# ============================================================
+
+def expensive_hash(data, iterations=1000):
+    """Simulate expensive computation."""
+    result = data.encode()
+    for _ in range(iterations):
+        result = hashlib.sha256(result).digest()
+    return result.hex()
+
+
+@app.route('/api/hash/<data>')
+def hash_endpoint(data):
+    """
+    This endpoint recomputes the hash every time.
+    Profile will show expensive_hash taking lots of time.
+    See hash_cached endpoint for improvement.
+    """
+    start = time.time()
+    result = expensive_hash(data)
+    elapsed = time.time() - start
+    return jsonify({
+        'input': data,
+        'hash': result[:16] + '...',
+        'elapsed_ms': round(elapsed * 1000, 2)
+    })
+
+
+@lru_cache(maxsize=1000)
+def expensive_hash_cached(data, iterations=1000):
+    """Cached version of expensive_hash."""
+    result = data.encode()
+    for _ in range(iterations):
+        result = hashlib.sha256(result).digest()
+    return result.hex()
+
+
+@app.route('/api/hash_cached/<data>')
+def hash_cached_endpoint(data):
+    """Cached version - compare profile with /api/hash."""
+    start = time.time()
+    result = expensive_hash_cached(data)
+    elapsed = time.time() - start
+    return jsonify({
+        'input': data,
+        'hash': result[:16] + '...',
+        'elapsed_ms': round(elapsed * 1000, 2),
+        'cache_info': str(expensive_hash_cached.cache_info())
+    })
+
+
+# ============================================================
+# Endpoint 3: I/O simulation
+# ============================================================
+
+@app.route('/api/slow_io')
+def slow_io_endpoint():
+    """
+    Simulate slow I/O (database query, external API, etc.)
+    This won't show much in CPU profiles - it's I/O bound!
+    """
+    time.sleep(0.1)  # Simulate 100ms I/O
+    return jsonify({'status': 'ok', 'simulated_io_ms': 100})
+
+
+# ============================================================
+# Endpoint 4: Mix of work types
+# ============================================================
+
+@app.route('/api/mixed/<int:n>')
+def mixed_endpoint(n):
+    """Mixed workload: some CPU, some I/O."""
+    n = min(n, 1000)
+
+    # CPU work
+    total = 0
+    for i in range(n * 100):
+        total += math.sin(i) * math.cos(i)
+
+    # Simulated I/O
+    time.sleep(0.01)
+
+    # More CPU work
+    data = str(total).encode()
+    for _ in range(100):
+        data = hashlib.md5(data).digest()
+
+    return jsonify({
+        'n': n,
+        'result': data.hex()[:16]
+    })
+
+
+# ============================================================
+# Health check
+# ============================================================
+
+@app.route('/health')
+def health():
+    return jsonify({'status': 'healthy', 'pyroscope': PYROSCOPE_AVAILABLE})
+
+
+@app.route('/')
+def index():
+    return '''
+    <h1>Pyroscope Demo App</h1>
+    <h2>Endpoints:</h2>
+    <ul>
+        <li>/api/primes/&lt;n&gt;</li>
+        <li>/api/hash/&lt;data&gt;</li>
+        <li>/api/hash_cached/&lt;data&gt;</li>
+        <li>/api/slow_io</li>
+        <li>/api/mixed/&lt;n&gt;</li>
+        <li>/health</li>
+    </ul>
+    <h2>Profiling:</h2>
+    <p>View profiles at <a href="http://localhost:4040">http://localhost:4040</a></p>
+ ''' + + +if __name__ == '__main__': + print("Starting Flask app on http://localhost:5000") + print("Pyroscope dashboard: http://localhost:4040") + print("\nGenerate load with: ./loadgen.sh") + print("Or: for i in $(seq 100); do curl -s localhost:5000/api/primes/5000 > /dev/null; done") + app.run(host='0.0.0.0', port=5000, debug=False) diff --git a/scenario7-pyroscope/loadgen.sh b/scenario7-pyroscope/loadgen.sh new file mode 100644 index 0000000..c992bbf --- /dev/null +++ b/scenario7-pyroscope/loadgen.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Load generator for Pyroscope demo +# Run this to generate traffic that will show up in Pyroscope +# + +BASE_URL="${1:-http://localhost:5000}" +DURATION="${2:-60}" # seconds + +echo "Generating load to $BASE_URL for $DURATION seconds" +echo "Press Ctrl+C to stop" +echo "" + +end_time=$(($(date +%s) + DURATION)) +request_count=0 + +while [ $(date +%s) -lt $end_time ]; do + # Mix of different endpoints + case $((RANDOM % 10)) in + 0|1|2|3) + # 40% - CPU intensive (primes) + n=$((1000 + RANDOM % 4000)) + curl -s "$BASE_URL/api/primes/$n" > /dev/null + ;; + 4|5) + # 20% - Hash (uncached) + data="data_$(($RANDOM % 100))" + curl -s "$BASE_URL/api/hash/$data" > /dev/null + ;; + 6|7) + # 20% - Hash (cached) + data="data_$(($RANDOM % 10))" # Smaller set for better cache hits + curl -s "$BASE_URL/api/hash_cached/$data" > /dev/null + ;; + 8) + # 10% - Slow I/O + curl -s "$BASE_URL/api/slow_io" > /dev/null + ;; + 9) + # 10% - Mixed + curl -s "$BASE_URL/api/mixed/500" > /dev/null + ;; + esac + + request_count=$((request_count + 1)) + + # Print progress every 10 requests + if [ $((request_count % 10)) -eq 0 ]; then + echo -ne "\rRequests: $request_count" + fi + + # Small delay to avoid overwhelming + sleep 0.1 +done + +echo "" +echo "Done! Total requests: $request_count" +echo "Check Pyroscope at http://localhost:4040" diff --git a/scenario7-pyroscope/requirements.txt b/scenario7-pyroscope/requirements.txt new file mode 100644 index 0000000..1d58601 --- /dev/null +++ b/scenario7-pyroscope/requirements.txt @@ -0,0 +1,2 @@ +flask>=2.0.0 +pyroscope-io>=0.8.0