class BlockPool:
    """Pool of pre-allocated KVCacheBlock objects, indexed 0..num_gpu_blocks-1."""

    # NOTE(review): the "..." in the parameter list is a tutorial placeholder
    # for the remaining constructor arguments — it is not valid Python syntax.
    def __init__(self, num_gpu_blocks: int, enable_caching: bool, ...):
        # All blocks in the pool
        self.blocks: list[KVCacheBlock] = [
            KVCacheBlock(idx) for idx in range(num_gpu_blocks)
        ]
def allocate(self) -> KVCacheBlock:
    """Take one block out of the pool and mark it as in use.

    Prefers the free queue; when the queue is empty, reclaims space by
    evicting a cached block. The returned block's ref_cnt is bumped by one.
    """
    if self.free_block_queue:
        # Fast path: reuse a block waiting in the free queue.
        out = self.free_block_queue.popleft()
    else:
        # Free queue exhausted — reclaim a cached block instead.
        out = self._evict_cached_block()
    out.ref_cnt += 1
    return out
释放:
1 2 3 4 5 6 7 8 9 10 11
def free(self, block: KVCacheBlock) -> None:
    """Release one reference to *block*; requeue it once no user remains."""
    block.ref_cnt -= 1
    if block.ref_cnt != 0:
        # Still referenced by another request — nothing to return yet.
        return
    if block.block_hash is None:
        # Uncached block - immediately available: head of the queue.
        self.free_block_queue.appendleft(block)
    else:
        # Cached block - tail of the queue so it is evicted last (LRU).
        self.free_block_queue.append(block)
# Walk the request's precomputed block hashes in order, reusing cached
# blocks until the first miss — a prefix match must be contiguous, so the
# scan stops at the first hash the pool does not know about.
for block_idx, block_hash in enumerate(request.block_hashes):
    # Try to find a cached block with this hash
    cached_block = self.block_pool.get_cached_block(block_hash)

    if cached_block is None:
        # Cache miss - stop searching
        break
# Scan free queue from left (LRU) for block in self.free_block_queue: if block.ref_cnt == 0: # Found eviction candidate self.free_block_queue.remove(block)
# Remove from cache lookup del self.cached_block_hash_to_block[block.block_hash] block.block_hash = None
500 blocks allocated across all requests Waste: ~10 blocks (partial last blocks) Efficiency: 98%
结果:并发请求数提升 4 倍!
吞吐量提升
真实基准测试(Llama-3-8B,运行于 H100):
指标
未使用 PagedAttention
使用 PagedAttention
提升幅度
并发请求数
12
64
5.3x
吞吐量(tok/s)
1,500
8,000
5.3x
内存占用
60 GB
60 GB
持平
延迟(TTFT)
45ms
42ms
-7%
Prefix Caching 收益
使用共同系统提示词(500 个 token)时:
请求次数
未使用缓存
使用缓存
加速比
第 1 次
10ms(prefill)
10ms
1x
第 2 次
10ms
1ms
10x
第 100 次
10ms
1ms
10x
缓存命中率:对于带有系统提示词的聊天机器人,通常在 60-80% 之间。
进阶主题
滑动窗口 attention
对于像 Mistral 这样使用滑动窗口的模型:
1 2 3 4 5 6 7
# Only keep last 4096 tokens in cache if num_tokens > sliding_window_size: # Free old blocks outside window blocks_to_free = num_tokens - sliding_window_size old_blocks = blocks[:blocks_to_free // block_size] for block in old_blocks: self.block_pool.free(block)
用于 speculative decoding 的 copy-on-write
使用 speculative decoding 时:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Draft model proposes N tokens
draft_tokens = draft_model.generate(n=5)

# Share blocks with draft (copy-on-write)
draft_blocks = request.blocks  # Same physical blocks!
draft_blocks[-1].ref_cnt += 1  # Increment last block

# Verify with main model
verified = main_model.verify(draft_tokens)

# NOTE(review): `all_verified` and `verified_idx` are presumably derived
# from `verified` above — the derivation is not shown in this excerpt.
if not all_verified:
    # Rollback - free draft blocks that failed verification.
    for block in draft_blocks[verified_idx:]:
        block.ref_cnt -= 1
实现中的注意事项
hash 碰撞
block 的 hash 值可能发生碰撞:
1 2 3 4 5 6
# Different token sequences with same hash (rare!) block_a = hash([1, 2, 3, ..., 16]) = 0x123456 block_b = hash([4, 5, 6, ..., 19]) = 0x123456
# Solution: Map hash to list of blocks cached_blocks[hash] = [block_a, block_b]
不完整的 block
最后一个 block 通常是不完整的:
1 2 3 4 5
# Example: 50 tokens with block_size=16 -> blocks 0-2 are full,
# Block 3 only has 2 tokens (50 % 16 = 2).

# Must track: num_tokens_in_last_block
# For attention: mask out unused positions in the final block.
线程安全
block 分配必须是线程安全的:
1 2 3 4
class BlockPool:
    def allocate(self):
        """Allocate a block; the pool lock makes the operation thread-safe."""
        # Serialize against concurrent allocate/free calls — the entire
        # handoff is the critical section.
        with self.lock:
            return self._allocate_unsafe()