# MPT monkey-patch utilities: rotary-embedding cache rebuild and attention-mask conversion.
# Case: (batch, key_len) -> expand to (batch, 1, 1, key_len) if attention_mask.dim() == 2: mask = attention_mask[:, None, None, :]
def _update_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype): if seq_len == self._cached_seq_len: return inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) self._cached_cos = emb.cos().to(dtype) self._cached_sin = emb.sin().to(dtype) self._cached_seq_len = seq_len patch mpt
# ----------------------------------------------------------------------
# 3. Monkey-patch into an existing MPT model (example)
# ----------------------------------------------------------------------
def apply_mpt_patches(model: nn.Module):
    """Replace rotary-embedding modules in an existing MPT model in place.

    Walks every submodule and, for each one that looks like a rotary
    embedding (name contains "rotary" and it carries a ``cos_cached``
    buffer), swaps its class to ``PatchedRotaryEmbedding`` so subsequent
    ``forward()`` calls run the patched implementation.

    Args:
        model: the MPT model to patch; modified in place.
    """
    for name, module in model.named_modules():
        # Heuristic match: NOTE(review) assumes unpatched rotary modules
        # expose a `cos_cached` attribute — confirm against the target
        # transformers version.
        if "rotary" in name.lower() and hasattr(module, "cos_cached"):
            # Original garbled source read `module. class = ...`, which is
            # a syntax error; __class__ reassignment is the intended patch.
            module.__class__ = PatchedRotaryEmbedding
            # Original f-string was missing the braces around `name`.
            print(f"[PATCH] Replaced rotary in {name}")