Definition: Small/fast memory built into CPUs for data that is accessed frequently
Metrics: hit rate, average latency
Hierarchy
Resource Tradeoffs
Manual: the programmer decides whether or not to cache (e.g., Spark); see the sketch after this list
Expiration: stale data (data past its expiration) is deleted or revalidated. SSDs are large, so freshness matters more than space; see the TTL sketch below.
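A minimal sketch of manual caching in PySpark, assuming a running SparkSession and a hypothetical input file trips.parquet; the point is that the programmer, not the system, decides what stays in memory:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("trips.parquet")  # hypothetical input file

df.cache()      # opt in: ask Spark to keep this DataFrame in memory
df.count()      # the first action materializes the cache
df.count()      # later actions reuse the cached data instead of re-reading
df.unpersist()  # opt out: release the memory when done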
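And a minimal sketch of expiration-based caching, assuming a fixed time-to-live and a caller-supplied fetch function (both names are illustrative):

import time

TTL_SECONDS = 60  # assumed freshness window
ttl_cache = {}    # key -> (value, time it was fetched)

def get_fresh(key, fetch):
    if key in ttl_cache:
        value, fetched_at = ttl_cache[key]
        if time.time() - fetched_at < TTL_SECONDS:
            return value      # still fresh: serve from cache
        del ttl_cache[key]    # past expiration: drop it
    value = fetch(key)        # revalidate by fetching current data
    ttl_cache[key] = (value, time.time())
    return value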
Process:
Examples
One big access pattern LRU is really bad at: a program that repeatedly scans a dataset slightly too big for the cache. Each miss evicts exactly the item the scan will need soonest, so every access misses (a sketch follows).
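A small self-contained sketch of that pattern: four items scanned repeatedly through an LRU cache that holds only three, yielding zero hits (the item names and sizes are made up for illustration):

cache_size = 3
lru_cache, order = set(), []           # order[0] is the least recently used

hits = misses = 0
for _ in range(3):                     # scan the dataset repeatedly
    for item in ["a", "b", "c", "d"]:  # 4 items, but the cache holds only 3
        if item in lru_cache:
            hits += 1
            order.remove(item); order.append(item)
        else:
            misses += 1
            lru_cache.add(item); order.append(item)
            if len(lru_cache) > cache_size:
                lru_cache.remove(order.pop(0))  # evict least recently used
print(hits, misses)  # 0 12: LRU always evicts the next item needed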
PROBLEM 5 MISSING
Preparation
import random
import requests
import pandas as pd

r = requests.get("https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt")
r.raise_for_status()
stations = r.text.strip().split("\n")
stations = random.sample(stations, k=10)

# skewed workload: 30% of requests hit one station, 20% hit another,
# and the remaining 50% is split evenly across the other 8
workload = random.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

station = stations[0]  # any sampled station works for this demo
df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                 names=["station", "date", "element", "value", "m", "q", "s", "obs"])
Define Variables
import time

# Basics
cache_size = 5
cache = {}        # key = station name, value = DataFrame for that station
evict_order = []  # start of the list holds the next item to be evicted

# stats
hits = []          # True for each hit, False for each miss
ms_latencies = []  # latency of each get_station call, in milliseconds
Cache Simulator (FIFO Demo)
def get_station(station):
    start = time.time()
    if station in cache:
        print("hit", end=", ")  # HIT!
        hits.append(True)
        df = cache[station]
    else:
        print("miss", end=", ")
        hits.append(False)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)
        if len(cache) > cache_size:
            victim = evict_order.pop(0)  # FIFO: evict the oldest insertion
            cache.pop(victim)
    end = time.time()
    ms = (end - start) * 1000
    ms_latencies.append(ms)
    return df
Cache Simulator (LRU Demo)
def get_station(station):
    start = time.time()
    if station in cache:
        print("hit", end=", ")  # HIT!
        hits.append(True)
        df = cache[station]
        evict_order.remove(station)  # LRU: move this station...
        evict_order.append(station)  # ...to the most-recently-used end
    else:
        print("miss", end=", ")
        hits.append(False)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)
        if len(cache) > cache_size:
            victim = evict_order.pop(0)  # evict the least recently used
            cache.pop(victim)
    end = time.time()
    ms = (end - start) * 1000
    ms_latencies.append(ms)
    return df
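A side note on the data structure: evict_order.remove(station) is O(n) per hit (and list.pop(0) is O(n) too, for both demos). The standard Python idiom is collections.OrderedDict, which makes both operations O(1); a minimal sketch, with load standing in for any function that fetches a station's DataFrame:

from collections import OrderedDict

lru = OrderedDict()  # first entry is the least recently used

def get_station_lru(station, load, cache_size=5):
    if station in lru:
        lru.move_to_end(station)  # mark as most recently used, O(1)
        return lru[station]
    df = load(station)            # miss: fetch the data
    lru[station] = df
    if len(lru) > cache_size:
        lru.popitem(last=False)   # evict the least recently used, O(1)
    return df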
Run Simulator
for station in workload:
    df = get_station(station)
    # print(station, evict_order)
Statistical Analysis
sum(hits) / len(hits) # hit rate
sum(ms_latencies) / len(ms_latencies) # avg latency
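Because hits and ms_latencies are parallel lists (one entry per call), latency can also be split by hit vs. miss; a short pandas sketch using the variables defined above:

stats = pd.DataFrame({"hit": hits, "ms": ms_latencies})
print(stats.groupby("hit")["ms"].mean())  # avg latency for misses (False) vs. hits (True)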