# patches/mpt_patch_rotary_cache.py
"""
Patch for MPT model (May 2026):

- Fix rotary embedding cache when sequence length changes between forward passes.
- Correct attention mask broadcasting for cross-attention layers.
"""
import torch
import torch.nn as nn
from typing import Optional, Tuple

# 1. Patch Rotary Embedding Cache
# ----------------------------------------------------------------------

def patched_rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Split and rotate half the hidden dims (fixed for fp16 stability)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


class PatchedRotaryEmbedding(nn.Module):
    """Rotary embedding with cache reset on seqlen change."""

    def __init__(self, dim: int, max_seq_len: int = 2048, base: int = 10000):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base
        self._cached_cos = None
        self._cached_sin = None
        self._cached_seq_len = None
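    # The forward body below is a minimal sketch of the behavior the class
    # docstring describes (rebuild the cos/sin tables whenever the requested
    # sequence length, device, or dtype differs from what is cached); the
    # signature and the `seq_len` keyword are assumptions.
    def forward(
        self, x: torch.Tensor, seq_len: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if seq_len is None:
            seq_len = x.shape[-2]
        # Rebuild only on a cache miss; a stale cache keyed to an old
        # seq_len is exactly the bug this patch addresses.
        if (
            self._cached_cos is None
            or self._cached_seq_len != seq_len
            or self._cached_cos.device != x.device
            or self._cached_cos.dtype != x.dtype
        ):
            inv_freq = 1.0 / (
                self.base
                ** (torch.arange(0, self.dim, 2, device=x.device).float() / self.dim)
            )
            t = torch.arange(seq_len, device=x.device).float()
            freqs = torch.outer(t, inv_freq)         # (seq_len, dim/2)
            emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)
            self._cached_cos = emb.cos().to(x.dtype)
            self._cached_sin = emb.sin().to(x.dtype)
            self._cached_seq_len = seq_len
        return self._cached_cos, self._cached_sin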

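# A sketch of how the cached tables and patched_rotate_half combine when
# applied to query/key tensors. The name apply_rotary_pos_emb and its
# signature are assumptions, not part of the original patch.
def apply_rotary_pos_emb(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply cos/sin tables to q/k tensors of shape (..., seq_len, dim)."""
    q_rot = (q * cos) + (patched_rotate_half(q) * sin)
    k_rot = (k * cos) + (patched_rotate_half(k) * sin)
    return q_rot, k_rot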

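# 2. Patch Attention Mask Broadcasting
# ----------------------------------------------------------------------

# Minimal sketch of the cross-attention mask fix named in the module
# docstring. Only `batch = attention_mask.size(0)` survives from the
# original; the function name, signature, and the keep=1/pad=0 mask
# convention are assumptions.
def patched_expand_attention_mask(
    attention_mask: torch.Tensor, q_len: int
) -> torch.Tensor:
    """Expand a 2-D (batch, kv_len) padding mask into a broadcastable
    4-D (batch, 1, q_len, kv_len) additive bias."""
    batch = attention_mask.size(0)
    kv_len = attention_mask.size(-1)
    # Add head and query dims so the mask broadcasts against attention
    # scores of shape (batch, n_heads, q_len, kv_len).
    mask = attention_mask[:, None, None, :].expand(batch, 1, q_len, kv_len)
    # Convert keep=1 / pad=0 into an additive 0 / -inf style bias.
    return (1.0 - mask.to(torch.float32)) * torch.finfo(torch.float32).min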
