Source code for lrnnx.ops.triton.layer_norm

"""
Reference: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/layer_norm.py
Copyright (c) 2024, Tri Dao.
Implement dropout + residual + layer_norm / rms_norm.

Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
"""

import math
import warnings

import torch
import torch.nn.functional as F
import triton
import triton.language as tl

from lrnnx.ops.torch import custom_bwd, custom_fwd


def layer_norm_ref(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    dropout_mask=None,
    dropout_mask1=None,
    upcast=False,
):
    """
    Pure-PyTorch reference for LayerNorm fused with dropout, residual add and
    an optional parallel branch.

    The pre-normalization state is built as
    ``dropout(x * rowscale) + dropout(x1) + residual`` (each term only when
    supplied) and then normalized with ``F.layer_norm``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Layer norm weights.
        bias (torch.Tensor | None): Layer norm biases.
        residual (torch.Tensor, optional): Tensor added to the input before
            normalization. Defaults to None.
        x1 (torch.Tensor, optional): Second input summed with ``x`` before
            normalization. Defaults to None.
        weight1 (torch.Tensor, optional): Weights of a second normalization
            applied to the same pre-norm state. Defaults to None.
        bias1 (torch.Tensor, optional): Biases of the second normalization.
            Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Per-row scale broadcast over the
            last dimension of ``x``. Not allowed together with ``x1``.
            Defaults to None.
        prenorm (bool, optional): Also return the pre-normalization state.
            Defaults to False.
        dropout_mask (torch.Tensor, optional): Boolean keep-mask for ``x``;
            when omitted ``F.dropout`` draws one. Defaults to None.
        dropout_mask1 (torch.Tensor, optional): Boolean keep-mask for ``x1``.
            Defaults to None.
        upcast (bool, optional): Convert all tensor inputs to float32 before
            computing. Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: ``out`` cast back to the
        input dtype, followed by ``out1`` when ``weight1`` is given, followed
        by the pre-norm state when ``prenorm=True``. A single tensor is
        returned when neither extra applies.
    """
    out_dtype = x.dtype

    if upcast:
        x, weight = x.float(), weight.float()
        bias = None if bias is None else bias.float()
        residual = residual if residual is None else residual.float()
        x1 = None if x1 is None else x1.float()
        weight1 = None if weight1 is None else weight1.float()
        bias1 = None if bias1 is None else bias1.float()

    if x1 is not None:
        assert (
            rowscale is None
        ), "rowscale is not supported with parallel LayerNorm"
    if rowscale is not None:
        x = x * rowscale[..., None]

    def _drop(tensor, keep):
        # An explicit mask makes the reference deterministic; otherwise fall
        # back to PyTorch's own dropout.
        if keep is None:
            return F.dropout(tensor, p=dropout_p)
        return tensor.masked_fill(~keep, 0.0) / (1.0 - dropout_p)

    if dropout_p > 0.0:
        # x is processed before x1 so the RNG is consumed in the same order.
        x = _drop(x, dropout_mask)
        if x1 is not None:
            x1 = _drop(x1, dropout_mask1)

    if x1 is not None:
        x = x + x1
    if residual is not None:
        x = (x + residual).to(x.dtype)

    def _normalize(w, b):
        normed = F.layer_norm(
            x.to(w.dtype), x.shape[-1:], weight=w, bias=b, eps=eps
        )
        return normed.to(out_dtype)

    results = [_normalize(weight, bias)]
    if weight1 is not None:
        results.append(_normalize(weight1, bias1))
    if prenorm:
        results.append(x)
    return results[0] if len(results) == 1 else tuple(results)
def rms_norm_ref(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    dropout_mask=None,
    dropout_mask1=None,
    upcast=False,
):
    """
    Pure-PyTorch reference for RMSNorm fused with dropout, residual add and
    an optional parallel branch.

    The pre-normalization state is built as
    ``dropout(x * rowscale) + dropout(x1) + residual`` (each term only when
    supplied), then scaled by ``1 / sqrt(mean(x**2, dim=-1) + eps)``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): RMS norm weights.
        bias (torch.Tensor | None): Biases added after scaling.
        residual (torch.Tensor, optional): Tensor added to the input before
            normalization. Defaults to None.
        x1 (torch.Tensor, optional): Second input summed with ``x`` before
            normalization. Defaults to None.
        weight1 (torch.Tensor, optional): Weights of a second output computed
            from the same normalized state. Defaults to None.
        bias1 (torch.Tensor, optional): Biases of the second output.
            Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Per-row scale broadcast over the
            last dimension of ``x``. Not allowed together with ``x1``.
            Defaults to None.
        prenorm (bool, optional): Also return the pre-normalization state.
            Defaults to False.
        dropout_mask (torch.Tensor, optional): Boolean keep-mask for ``x``;
            when omitted ``F.dropout`` draws one. Defaults to None.
        dropout_mask1 (torch.Tensor, optional): Boolean keep-mask for ``x1``.
            Defaults to None.
        upcast (bool, optional): Convert all tensor inputs to float32 before
            computing. Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: ``out`` cast back to the
        input dtype, followed by ``out1`` when ``weight1`` is given, followed
        by the pre-norm state when ``prenorm=True``. A single tensor is
        returned when neither extra applies.
    """
    out_dtype = x.dtype

    if upcast:
        x, weight = x.float(), weight.float()
        bias = None if bias is None else bias.float()
        residual = residual if residual is None else residual.float()
        x1 = None if x1 is None else x1.float()
        weight1 = None if weight1 is None else weight1.float()
        bias1 = None if bias1 is None else bias1.float()

    if x1 is not None:
        assert (
            rowscale is None
        ), "rowscale is not supported with parallel LayerNorm"
    if rowscale is not None:
        x = x * rowscale[..., None]

    def _drop(tensor, keep):
        # An explicit mask makes the reference deterministic; otherwise fall
        # back to PyTorch's own dropout.
        if keep is None:
            return F.dropout(tensor, p=dropout_p)
        return tensor.masked_fill(~keep, 0.0) / (1.0 - dropout_p)

    if dropout_p > 0.0:
        # x is processed before x1 so the RNG is consumed in the same order.
        x = _drop(x, dropout_mask)
        if x1 is not None:
            x1 = _drop(x1, dropout_mask1)

    if x1 is not None:
        x = x + x1
    if residual is not None:
        x = (x + residual).to(x.dtype)

    mean_square = (x.square()).mean(dim=-1, keepdim=True)
    rstd = 1 / torch.sqrt(mean_square + eps)

    def _scale(w, b):
        scaled = x * rstd * w
        if b is not None:
            scaled = scaled + b
        return scaled.to(out_dtype)

    results = [_scale(weight, bias)]
    if weight1 is not None:
        results.append(_scale(weight1, bias1))
    if prenorm:
        results.append(x)
    return results[0] if len(results) == 1 else tuple(results)
def config_prune(configs):
    """
    Drop Triton configs whose ``num_warps`` exceeds what one block can hold.

    The limit is ``1024 // warp_size``. On CUDA builds the warp size is 32.
    On ROCm builds (``torch.version.hip`` is set) it is looked up from device
    0: 32 for ``gfx10``/``gfx11`` architectures, 64 otherwise.

    Args:
        configs (list[triton.Config]): List of Triton kernel configurations.

    Returns:
        list[triton.Config]: A new list with the oversized configs removed,
        or ``configs`` itself when ``num_warps`` cannot be compared.
    """
    max_block_sz = 1024

    if not torch.version.hip:
        # cuda
        warp_size = 32
    else:
        try:
            # set warp size based on gcn architecture
            gcn_arch_name = torch.cuda.get_device_properties(0).gcnArchName
            is_radeon = "gfx10" in gcn_arch_name or "gfx11" in gcn_arch_name
            # radeon uses 32-wide warps, instinct uses 64-wide ones
            warp_size = 32 if is_radeon else 64
        except AttributeError as e:
            # fall back to crude method to set warp size
            device_name = torch.cuda.get_device_properties(0).name
            warp_size = 64 if "instinct" in device_name.lower() else 32
            warnings.warn(
                f"{e}, warp size set to {warp_size} based on device name: {device_name}",
                UserWarning,
            )

    max_num_warps = max_block_sz // warp_size

    kept = []
    try:
        for candidate in configs:
            if candidate.num_warps <= max_num_warps:
                kept.append(candidate)
    except TypeError:
        # Bypass for Sphinx documentation builds where Triton is mocked
        return configs
    return kept
# Candidate launch configurations; only the number of warps is tuned.
configs_autotune = [
    triton.Config({}, num_warps=1),
    triton.Config({}, num_warps=2),
    triton.Config({}, num_warps=4),
    triton.Config({}, num_warps=8),
    triton.Config({}, num_warps=16),
    triton.Config({}, num_warps=32),
]

# Evaluated once at import time; shared by the forward and backward kernels.
pruned_configs_autotune = config_prune(configs_autotune)


@triton.autotune(
    configs=pruned_configs_autotune,
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    RESIDUAL,  # pointer to the residual
    X1,
    W1,
    B1,
    Y1,
    RESIDUAL_OUT,  # pointer to the residual
    ROWSCALE,
    SEEDS,  # Dropout seeds for each row
    DROPOUT_MASK,
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_res_row,
    stride_res_out_row,
    stride_x1_row,
    stride_y1_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,  # Dropout probability
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    STORE_DROPOUT_MASK: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_X1: tl.constexpr,
    HAS_W1: tl.constexpr,
    HAS_B1: tl.constexpr,
):
    """
    Triton JIT kernel for fused layer/rms norm forward pass.

    Each program handles one row: it loads the row (up to ``BLOCK_N``
    columns, masked to ``N``), applies the optional rowscale / dropout /
    ``X1`` / residual terms, optionally stores that pre-norm row to
    ``RESIDUAL_OUT``, then normalizes and writes ``Y`` (and ``Y1`` when
    ``W1`` is given). ``Rstd`` is always written; ``Mean`` only for layer
    norm. The ``X1`` branch reads its seeds and writes its dropout mask at
    row offset ``M + row``.
    """
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    X += row * stride_x_row
    Y += row * stride_y_row
    if HAS_RESIDUAL:
        RESIDUAL += row * stride_res_row
    if STORE_RESIDUAL_OUT:
        RESIDUAL_OUT += row * stride_res_out_row
    if HAS_X1:
        X1 += row * stride_x1_row
    if HAS_W1:
        Y1 += row * stride_y1_row
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_ROWSCALE:
        rowscale = tl.load(ROWSCALE + row).to(tl.float32)
        x *= rowscale
    if HAS_DROPOUT:
        # Compute dropout mask
        # 7 rounds is good enough, and reduces register pressure
        keep_mask = (
            tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
            > dropout_p
        )
        x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
        if STORE_DROPOUT_MASK:
            tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
    if HAS_X1:
        x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
        if HAS_ROWSCALE:
            rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
            x1 *= rowscale
        if HAS_DROPOUT:
            # Compute dropout mask
            # 7 rounds is good enough, and reduces register pressure
            keep_mask = (
                tl.rand(
                    tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7
                )
                > dropout_p
            )
            x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
            if STORE_DROPOUT_MASK:
                tl.store(
                    DROPOUT_MASK + (M + row) * N + cols,
                    keep_mask,
                    mask=cols < N,
                )
        x += x1
    if HAS_RESIDUAL:
        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(
            tl.float32
        )
        x += residual
    if STORE_RESIDUAL_OUT:
        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        # Zero the padding columns so they do not contribute to the variance.
        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    # Write output
    tl.store(Y + cols, y, mask=mask)
    if HAS_W1:
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
        if HAS_B1:
            b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
        y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
        tl.store(Y1 + cols, y1, mask=mask)


def _layer_norm_fwd(
    x,
    weight,
    bias,
    eps,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    dropout_p=0.0,
    rowscale=None,
    out_dtype=None,
    residual_dtype=None,
    is_rms_norm=False,
    return_dropout_mask=False,
):
    """
    Python wrapper for the Triton fused layer/rms norm forward kernel.

    Validates shapes and strides, allocates the outputs and launches one
    kernel program per row of ``x``.

    Args:
        x (torch.Tensor): 2D input of shape ``(M, N)`` with unit stride in the
            last dimension.
        weight (torch.Tensor): Weights of shape ``(N,)``.
        bias (torch.Tensor | None): Biases of shape ``(N,)``.
        eps (float): Epsilon added to the variance.
        residual (torch.Tensor, optional): ``(M, N)`` residual. When given,
            its dtype overrides ``residual_dtype``.
        x1 (torch.Tensor, optional): Parallel input with the same shape as
            ``x``; requires ``rowscale`` to be None.
        weight1 (torch.Tensor, optional): ``(N,)`` weights for a second output.
        bias1 (torch.Tensor, optional): ``(N,)`` biases for the second output.
        dropout_p (float, optional): Dropout probability; seeds are drawn
            only when it is positive.
        rowscale (torch.Tensor, optional): Contiguous ``(M,)`` per-row scale.
        out_dtype (torch.dtype, optional): Output dtype; defaults to
            ``x.dtype``.
        residual_dtype (torch.dtype, optional): Dtype of the stored pre-norm
            state when no ``residual`` is passed.
        is_rms_norm (bool, optional): Compute RMS norm (no mean is stored).
        return_dropout_mask (bool, optional): Materialize the boolean keep
            mask(s) when dropout is active.

    Returns:
        tuple: ``(y, y1, mean, rstd, residual_out, seeds, dropout_mask,
        dropout_mask1)``. ``y1``, ``mean``, ``seeds`` and the masks are None
        when the matching feature is unused. ``residual_out`` falls back to
        ``x`` when no separate pre-norm buffer was needed.

    Raises:
        RuntimeError: If a row does not fit in a single 64KB block.
    """
    if residual is not None:
        residual_dtype = residual.dtype
    M, N = x.shape
    assert x.stride(-1) == 1
    if residual is not None:
        assert residual.stride(-1) == 1
        assert residual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if x1 is not None:
        assert x1.shape == x.shape
        assert rowscale is None
        assert x1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)
    # allocate output
    y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
    assert y.stride(-1) == 1
    if weight1 is not None:
        y1 = torch.empty_like(y)
        assert y1.stride(-1) == 1
    else:
        y1 = None
    # A separate pre-norm buffer is needed whenever the value being
    # normalized differs from x (in content or in dtype).
    if (
        residual is not None
        or (residual_dtype is not None and residual_dtype != x.dtype)
        or dropout_p > 0.0
        or rowscale is not None
        or x1 is not None
    ):
        residual_out = torch.empty(
            M,
            N,
            device=x.device,
            dtype=residual_dtype if residual_dtype is not None else x.dtype,
        )
        assert residual_out.stride(-1) == 1
    else:
        residual_out = None
    mean = (
        torch.empty((M,), dtype=torch.float32, device=x.device)
        if not is_rms_norm
        else None
    )
    rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
    if dropout_p > 0.0:
        # One seed per row; a second set of M seeds is drawn for x1.
        seeds = torch.randint(
            2**32,
            (M if x1 is None else 2 * M,),
            device=x.device,
            dtype=torch.int64,
        )
    else:
        seeds = None
    if return_dropout_mask and dropout_p > 0.0:
        dropout_mask = torch.empty(
            M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
        )
    else:
        dropout_mask = None
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB."
        )
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[(M,)](
            x,
            y,
            weight,
            bias,
            residual,
            x1,
            weight1,
            bias1,
            y1,
            residual_out,
            rowscale,
            seeds,
            dropout_mask,
            mean,
            rstd,
            x.stride(0),
            y.stride(0),
            residual.stride(0) if residual is not None else 0,
            residual_out.stride(0) if residual_out is not None else 0,
            x1.stride(0) if x1 is not None else 0,
            y1.stride(0) if y1 is not None else 0,
            M,
            N,
            eps,
            dropout_p,
            is_rms_norm,
            BLOCK_N,
            residual is not None,
            residual_out is not None,
            bias is not None,
            dropout_p > 0.0,
            dropout_mask is not None,
            rowscale is not None,
        )
    # residual_out is None only when there is no residual, no dtype change,
    # no dropout, no rowscale and no x1; x itself is then the pre-norm state.
    if dropout_mask is not None and x1 is not None:
        # First M rows belong to x, the last M rows to x1.
        dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
    else:
        dropout_mask1 = None
    return (
        y,
        y1,
        mean,
        rstd,
        residual_out if residual_out is not None else x,
        seeds,
        dropout_mask,
        dropout_mask1,
    )


@triton.autotune(
    configs=pruned_configs_autotune,
    key=[
        "N",
        "HAS_DRESIDUAL",
        "STORE_DRESIDUAL",
        "IS_RMS_NORM",
        "HAS_BIAS",
        "HAS_DROPOUT",
    ],
)
@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
@triton.jit
def _layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DRESIDUAL,
    W1,
    DY1,
    DX1,
    DW1,
    DB1,
    DRESIDUAL_IN,
    ROWSCALE,
    SEEDS,
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_dy_row,
    stride_dx_row,
    stride_dres_row,
    stride_dy1_row,
    stride_dx1_row,
    stride_dres_in_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,
    rows_per_program,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_DRESIDUAL: tl.constexpr,
    STORE_DRESIDUAL: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_DY1: tl.constexpr,
    HAS_DX1: tl.constexpr,
    HAS_B1: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
):
    """
    Triton JIT kernel for fused layer/rms norm backward pass.

    Each program walks ``rows_per_program`` consecutive rows, writes ``DX``
    (and optionally ``DX1`` / ``DRESIDUAL_IN`` / recomputed ``Y``) per row,
    and accumulates weight/bias gradients across its rows. The accumulated
    partial sums are stored at row ``program_id`` of ``DW`` / ``DB`` /
    ``DW1`` / ``DB1``. Dropout masks are regenerated from ``SEEDS`` with the
    same ``tl.rand`` call as the forward kernel.
    """
    # Map the program id to the elements of X, DX, and DY it should compute.
    row_block_id = tl.program_id(0)
    row_start = row_block_id * rows_per_program
    # Do not early exit if row_start >= M, because we need to write DW and DB
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    X += row_start * stride_x_row
    if HAS_DRESIDUAL:
        DRESIDUAL += row_start * stride_dres_row
    if STORE_DRESIDUAL:
        DRESIDUAL_IN += row_start * stride_dres_in_row
    DY += row_start * stride_dy_row
    DX += row_start * stride_dx_row
    if HAS_DY1:
        DY1 += row_start * stride_dy1_row
    if HAS_DX1:
        DX1 += row_start * stride_dx1_row
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_DY1:
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
    # Per-program accumulators for the parameter gradients.
    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_DY1:
        dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
        if HAS_B1:
            db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
    row_end = min((row_block_id + 1) * rows_per_program, M)
    for row in range(row_start, row_end):
        # Load data to SRAM
        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
        if HAS_DY1:
            dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        xhat = tl.where(mask, xhat, 0.0)
        if RECOMPUTE_OUTPUT:
            y = xhat * w + b if HAS_BIAS else xhat * w
            tl.store(Y + cols, y, mask=mask)
        wdy = w * dy
        dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if HAS_DY1:
            wdy += w1 * dy1
            dw1 += dy1 * xhat
            if HAS_B1:
                db1 += dy1
        if not IS_RMS_NORM:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            dx = (wdy - xhat * c1) * rstd
        if HAS_DRESIDUAL:
            dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
            dx += dres
        # Write dx
        if STORE_DRESIDUAL:
            # Stored before dropout/rowscale are applied to dx below.
            tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
        if HAS_DX1:
            if HAS_DROPOUT:
                keep_mask = (
                    tl.rand(
                        tl.load(SEEDS + M + row).to(tl.uint32),
                        cols,
                        n_rounds=7,
                    )
                    > dropout_p
                )
                dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
            else:
                dx1 = dx
            tl.store(DX1 + cols, dx1, mask=mask)
        if HAS_DROPOUT:
            keep_mask = (
                tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
                > dropout_p
            )
            dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
        if HAS_ROWSCALE:
            rowscale = tl.load(ROWSCALE + row).to(tl.float32)
            dx *= rowscale
        tl.store(DX + cols, dx, mask=mask)

        # Advance every row pointer to the next row.
        X += stride_x_row
        if HAS_DRESIDUAL:
            DRESIDUAL += stride_dres_row
        if STORE_DRESIDUAL:
            DRESIDUAL_IN += stride_dres_in_row
        if RECOMPUTE_OUTPUT:
            Y += stride_y_row
        DY += stride_dy_row
        DX += stride_dx_row
        if HAS_DY1:
            DY1 += stride_dy1_row
        if HAS_DX1:
            DX1 += stride_dx1_row
    tl.store(DW + row_block_id * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * N + cols, db, mask=mask)
    if HAS_DY1:
        tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
        if HAS_B1:
            tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)


def _layer_norm_bwd(
    dy,
    x,
    weight,
    bias,
    eps,
    mean,
    rstd,
    dresidual=None,
    dy1=None,
    weight1=None,
    bias1=None,
    seeds=None,
    dropout_p=0.0,
    rowscale=None,
    has_residual=False,
    has_x1=False,
    is_rms_norm=False,
    x_dtype=None,
    recompute_output=False,
):
    """
    Python wrapper for the Triton fused layer/rms norm backward kernel.

    Launches one program per streaming multiprocessor; each program handles
    ``ceil(M / sm_count)`` rows and writes one row of partial weight/bias
    gradients, which are then summed here.

    Args:
        dy (torch.Tensor): ``(M, N)`` gradient of the output.
        x (torch.Tensor): ``(M, N)`` input that was normalized in the forward
            pass, with unit stride in the last dimension.
        weight (torch.Tensor): ``(N,)`` weights.
        bias (torch.Tensor | None): ``(N,)`` biases.
        eps (float): Epsilon, forwarded to the kernel.
        mean (torch.Tensor | None): Per-row means saved by the forward pass.
        rstd (torch.Tensor): Per-row reciprocal standard deviations.
        dresidual (torch.Tensor, optional): ``(M, N)`` gradient flowing into
            the pre-norm state.
        dy1 (torch.Tensor, optional): Gradient of the second output; requires
            ``weight1``.
        weight1 (torch.Tensor, optional): ``(N,)`` weights of the second output.
        bias1 (torch.Tensor, optional): ``(N,)`` biases of the second output.
        seeds (torch.Tensor, optional): Contiguous dropout seeds of shape
            ``(M,)`` or ``(2 * M,)`` when ``has_x1``.
        dropout_p (float, optional): Dropout probability used in forward.
        rowscale (torch.Tensor, optional): Contiguous ``(M,)`` per-row scale.
        has_residual (bool, optional): Whether forward received a residual.
        has_x1 (bool, optional): Whether forward received ``x1``.
        is_rms_norm (bool, optional): RMS norm instead of layer norm.
        x_dtype (torch.dtype, optional): Dtype for ``dx``; defaults to
            ``x.dtype``.
        recompute_output (bool, optional): Also recompute and return ``y``;
            not supported together with ``weight1``.

    Returns:
        tuple: ``(dx, dw, db, dresidual_in, dx1, dw1, db1)``, with ``y``
        appended when ``recompute_output`` is True.

    Raises:
        RuntimeError: If a row does not fit in a single 64KB block.
    """
    M, N = x.shape
    assert x.stride(-1) == 1
    assert dy.stride(-1) == 1
    assert dy.shape == (M, N)
    if dresidual is not None:
        assert dresidual.stride(-1) == 1
        assert dresidual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if dy1 is not None:
        assert weight1 is not None
        assert dy1.shape == dy.shape
        assert dy1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if seeds is not None:
        assert seeds.is_contiguous()
        assert seeds.shape == (M if not has_x1 else M * 2,)
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)
    # allocate output
    dx = (
        torch.empty_like(x)
        if x_dtype is None
        else torch.empty(M, N, dtype=x_dtype, device=x.device)
    )
    dresidual_in = (
        torch.empty_like(x)
        if has_residual
        and (
            dx.dtype != x.dtype
            or dropout_p > 0.0
            or rowscale is not None
            or has_x1
        )
        else None
    )
    dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
    y = (
        torch.empty(M, N, dtype=dy.dtype, device=dy.device)
        if recompute_output
        else None
    )
    if recompute_output:
        assert (
            weight1 is None
        ), "recompute_output is not supported with parallel LayerNorm"

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB."
        )
    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
    # One row of partial gradients per program; reduced with .sum(0) below.
    _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
    _db = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
        if bias is not None
        else None
    )
    _dw1 = torch.empty_like(_dw) if weight1 is not None else None
    # Allocate from bias1 directly: _db is None when `bias` is None, so
    # deriving this buffer from _db would fail for bias=None with bias1 set.
    _db1 = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias1.device)
        if bias1 is not None
        else None
    )
    rows_per_program = math.ceil(M / sm_count)
    grid = (sm_count,)
    with torch.cuda.device(x.device.index):
        _layer_norm_bwd_kernel[grid](
            x,
            weight,
            bias,
            y,
            dy,
            dx,
            _dw,
            _db,
            dresidual,
            weight1,
            dy1,
            dx1,
            _dw1,
            _db1,
            dresidual_in,
            rowscale,
            seeds,
            mean,
            rstd,
            x.stride(0),
            0 if not recompute_output else y.stride(0),
            dy.stride(0),
            dx.stride(0),
            dresidual.stride(0) if dresidual is not None else 0,
            dy1.stride(0) if dy1 is not None else 0,
            dx1.stride(0) if dx1 is not None else 0,
            dresidual_in.stride(0) if dresidual_in is not None else 0,
            M,
            N,
            eps,
            dropout_p,
            rows_per_program,
            is_rms_norm,
            BLOCK_N,
            dresidual is not None,
            dresidual_in is not None,
            bias is not None,
            dropout_p > 0.0,
        )
    dw = _dw.sum(0).to(weight.dtype)
    db = _db.sum(0).to(bias.dtype) if bias is not None else None
    dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
    db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
    # Don't need to compute dresidual_in separately in this case
    if (
        has_residual
        and dx.dtype == x.dtype
        and dropout_p == 0.0
        and rowscale is None
    ):
        dresidual_in = dx
    # Without dropout the x1 branch receives exactly the same gradient as x.
    if has_x1 and dropout_p == 0.0:
        dx1 = dx
    return (
        (dx, dw, db, dresidual_in, dx1, dw1, db1)
        if not recompute_output
        else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
    )
class LayerNormFn(torch.autograd.Function):
    """
    Autograd function wrapping the fused Triton Layer/RMS norm kernels, with
    optional residual, dropout, rowscale and parallel-branch support.
    """

    @staticmethod
    def forward(
        ctx,
        x,
        weight,
        bias,
        residual=None,
        x1=None,
        weight1=None,
        bias1=None,
        eps=1e-6,
        dropout_p=0.0,
        rowscale=None,
        prenorm=False,
        residual_in_fp32=False,
        is_rms_norm=False,
        return_dropout_mask=False,
    ):
        """
        Forward pass for the LayerNormFn.

        Inputs are flattened to 2D over the last dimension, passed to
        ``_layer_norm_fwd``, and the results are reshaped back to the
        original shape of ``x``.

        Args:
            ctx (Any): Autograd context.
            x (torch.Tensor): Input tensor.
            weight (torch.Tensor): Normalization weights.
            bias (torch.Tensor | None): Normalization biases.
            residual (torch.Tensor, optional): Residual with the same shape
                as ``x``. Defaults to None.
            x1 (torch.Tensor, optional): Parallel input with the same shape
                as ``x``; cannot be combined with ``rowscale``. Defaults to None.
            weight1 (torch.Tensor, optional): Weights of the second output.
                Defaults to None.
            bias1 (torch.Tensor, optional): Biases of the second output.
                Defaults to None.
            eps (float, optional): Epsilon for numerical stability.
                Defaults to 1e-6.
            dropout_p (float, optional): Dropout probability. Defaults to 0.0.
            rowscale (torch.Tensor, optional): Row-wise scaling factor.
                Defaults to None.
            prenorm (bool, optional): Also return the pre-normalization
                state. Defaults to False.
            residual_in_fp32 (bool, optional): Store the pre-norm state in
                float32 when no ``residual`` is given. Defaults to False.
            is_rms_norm (bool, optional): Compute RMS norm instead of layer
                norm. Defaults to False.
            return_dropout_mask (bool, optional): Append the two dropout
                masks to the outputs. Defaults to False.

        Returns:
            torch.Tensor | tuple: ``y``, then ``y1`` if ``weight1`` is given,
            then the pre-norm state if ``prenorm``, then
            ``(dropout_mask, dropout_mask1)`` if ``return_dropout_mask``.
        """
        x_shape_og = x.shape

        def _as_rows(tensor):
            # Collapse leading dims; the kernels need unit stride on the last dim.
            tensor = tensor.reshape(-1, tensor.shape[-1])
            return tensor if tensor.stride(-1) == 1 else tensor.contiguous()

        def _restore(tensor):
            return None if tensor is None else tensor.reshape(x_shape_og)

        # reshape input data into 2D tensor
        x = _as_rows(x)
        if residual is not None:
            assert residual.shape == x_shape_og
            residual = _as_rows(residual)
        if x1 is not None:
            assert x1.shape == x_shape_og
            assert (
                rowscale is None
            ), "rowscale is not supported with parallel LayerNorm"
            x1 = _as_rows(x1)

        weight = weight.contiguous()
        if bias is not None:
            bias = bias.contiguous()
        if weight1 is not None:
            weight1 = weight1.contiguous()
        if bias1 is not None:
            bias1 = bias1.contiguous()
        if rowscale is not None:
            rowscale = rowscale.reshape(-1).contiguous()

        if residual is not None:
            residual_dtype = residual.dtype
        elif residual_in_fp32:
            residual_dtype = torch.float32
        else:
            residual_dtype = None

        fwd_result = _layer_norm_fwd(
            x,
            weight,
            bias,
            eps,
            residual,
            x1,
            weight1,
            bias1,
            dropout_p=dropout_p,
            rowscale=rowscale,
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
            return_dropout_mask=return_dropout_mask,
        )
        (
            y,
            y1,
            mean,
            rstd,
            residual_out,
            seeds,
            dropout_mask,
            dropout_mask1,
        ) = fwd_result

        ctx.save_for_backward(
            residual_out,
            weight,
            bias,
            weight1,
            bias1,
            rowscale,
            seeds,
            mean,
            rstd,
        )
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.dropout_p = dropout_p
        ctx.is_rms_norm = is_rms_norm
        ctx.has_residual = residual is not None
        ctx.has_x1 = x1 is not None
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype

        y = y.reshape(x_shape_og)
        y1 = _restore(y1)
        residual_out = _restore(residual_out)
        dropout_mask = _restore(dropout_mask)
        dropout_mask1 = _restore(dropout_mask1)

        outputs = [y]
        if weight1 is not None:
            outputs.append(y1)
        if prenorm:
            outputs.append(residual_out)
        if return_dropout_mask:
            outputs.extend((dropout_mask, dropout_mask1))
        return outputs[0] if len(outputs) == 1 else tuple(outputs)

    @staticmethod
    def backward(ctx, dy, *args):
        """
        Backward pass for the LayerNormFn.

        Args:
            ctx (Any): Autograd context.
            dy (torch.Tensor): Gradient of the output tensor.
            *args: Gradients of the remaining forward outputs, in forward
                order (``dy1`` first when ``weight1`` was used, then the
                pre-norm gradient when ``prenorm`` was set).

        Returns:
            tuple: Gradients for ``x``, ``weight``, ``bias``, ``residual``,
            ``x1``, ``weight1`` and ``bias1``, followed by None for the seven
            non-tensor / non-differentiable forward arguments.
        """
        (
            x,
            weight,
            bias,
            weight1,
            bias1,
            rowscale,
            seeds,
            mean,
            rstd,
        ) = ctx.saved_tensors

        def _as_rows(grad):
            grad = grad.reshape(-1, grad.shape[-1])
            return grad if grad.stride(-1) == 1 else grad.contiguous()

        dy = _as_rows(dy)
        assert dy.shape == x.shape

        # Index of the next unconsumed entry in *args.
        cursor = 0
        dy1 = None
        if weight1 is not None:
            dy1 = _as_rows(args[cursor])
            cursor += 1
            assert dy1.shape == x.shape

        dresidual = None
        if ctx.prenorm:
            dresidual = _as_rows(args[cursor])
            assert dresidual.shape == x.shape

        dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
            dy,
            x,
            weight,
            bias,
            ctx.eps,
            mean,
            rstd,
            dresidual,
            dy1,
            weight1,
            bias1,
            seeds,
            ctx.dropout_p,
            rowscale,
            ctx.has_residual,
            ctx.has_x1,
            ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
        )

        og_shape = ctx.x_shape_og
        grad_x = dx.reshape(og_shape)
        grad_residual = (
            dresidual_in.reshape(og_shape) if ctx.has_residual else None
        )
        grad_x1 = dx1.reshape(og_shape) if dx1 is not None else None
        return (
            grad_x,
            dw,
            db,
            grad_residual,
            grad_x1,
            dw1,
            db1,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
def layer_norm_fn(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
    is_rms_norm=False,
    return_dropout_mask=False,
):
    """
    Functional entry point for the fused Triton Layer/RMS normalization.

    All arguments are forwarded positionally to ``LayerNormFn.apply``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Normalization weights.
        bias (torch.Tensor | None): Normalization biases.
        residual (torch.Tensor, optional): Residual added before the norm.
            Defaults to None.
        x1 (torch.Tensor, optional): Parallel input branch. Defaults to None.
        weight1 (torch.Tensor, optional): Parallel branch weights.
            Defaults to None.
        bias1 (torch.Tensor, optional): Parallel branch biases.
            Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Row-wise scaling factor.
            Defaults to None.
        prenorm (bool, optional): Also return the pre-normalization state.
            Defaults to False.
        residual_in_fp32 (bool, optional): Keep the residual in FP32.
            Defaults to False.
        is_rms_norm (bool, optional): Compute RMS norm instead of layer norm.
            Defaults to False.
        return_dropout_mask (bool, optional): Also return the dropout masks.
            Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: The normalized output; with
        ``prenorm=True`` the pre-norm state is returned alongside it.
    """
    forwarded = (
        x,
        weight,
        bias,
        residual,
        x1,
        weight1,
        bias1,
        eps,
        dropout_p,
        rowscale,
        prenorm,
        residual_in_fp32,
        is_rms_norm,
        return_dropout_mask,
    )
    return LayerNormFn.apply(*forwarded)
def rms_norm_fn(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
    return_dropout_mask=False,
):
    """
    Functional entry point for the fused Triton RMS normalization.

    Forwards to ``LayerNormFn.apply`` with ``is_rms_norm`` fixed to True.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Normalization weights.
        bias (torch.Tensor | None): Normalization biases.
        residual (torch.Tensor, optional): Residual added before the norm.
            Defaults to None.
        x1 (torch.Tensor, optional): Parallel input branch. Defaults to None.
        weight1 (torch.Tensor, optional): Parallel branch weights.
            Defaults to None.
        bias1 (torch.Tensor, optional): Parallel branch biases.
            Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Row-wise scaling factor.
            Defaults to None.
        prenorm (bool, optional): Also return the pre-normalization state.
            Defaults to False.
        residual_in_fp32 (bool, optional): Keep the residual in FP32.
            Defaults to False.
        return_dropout_mask (bool, optional): Also return the dropout masks.
            Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: The RMS-normalized output;
        with ``prenorm=True`` the pre-norm state is returned alongside it.
    """
    use_rms_norm = True
    forwarded = (
        x,
        weight,
        bias,
        residual,
        x1,
        weight1,
        bias1,
        eps,
        dropout_p,
        rowscale,
        prenorm,
        residual_in_fp32,
        use_rms_norm,
        return_dropout_mask,
    )
    return LayerNormFn.apply(*forwarded)
class RMSNorm(torch.nn.Module):
    """
    RMS normalization layer backed by the fused ``rms_norm_fn``.

    The layer owns a learnable ``weight`` of size ``hidden_size`` and no bias
    (``bias`` is registered as None).

    Args:
        hidden_size (int): Dimension of the features to normalize.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-5.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        device (torch.device, optional): Device for parameters. Defaults to None.
        dtype (torch.dtype, optional): Data type for parameters. Defaults to None.
    """

    def __init__(
        self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None
    ):
        super().__init__()
        self.eps = eps
        # The Dropout module is only a holder for `p`; it is never called.
        self.drop = torch.nn.Dropout(dropout_p) if dropout_p > 0.0 else None
        initial_weight = torch.empty(hidden_size, device=device, dtype=dtype)
        self.weight = torch.nn.Parameter(initial_weight)
        self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with ones."""
        torch.nn.init.ones_(self.weight)

    def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
        """
        Forward pass for RMSNorm.

        Dropout is applied only when the layer was built with
        ``dropout_p > 0`` and the module is in training mode.

        Args:
            x (torch.Tensor): Input tensor.
            residual (torch.Tensor, optional): Residual connection.
                Defaults to None.
            prenorm (bool, optional): Also return the state before
                normalization. Defaults to False.
            residual_in_fp32 (bool, optional): Keep the residual in float32.
                Defaults to False.

        Returns:
            torch.Tensor | tuple[torch.Tensor, torch.Tensor]: Normalized
            output. If ``prenorm=True``, returns ``(out, prenorm_state)``.
        """
        active_dropout_p = 0.0
        if self.drop is not None and self.training:
            active_dropout_p = self.drop.p
        return rms_norm_fn(
            x,
            self.weight,
            self.bias,
            residual=residual,
            eps=self.eps,
            dropout_p=active_dropout_p,
            prenorm=prenorm,
            residual_in_fp32=residual_in_fp32,
        )
class LayerNormLinearFn(torch.autograd.Function):
    """
    Autograd function chaining a Layer/RMS normalization with a linear layer.

    The normalized activation is not kept for the backward pass; only the
    normalization input and statistics are saved, and the activation is
    recomputed when gradients are needed.
    """

    @staticmethod
    @custom_fwd(device_type="cuda")
    def forward(
        ctx,
        x,
        norm_weight,
        norm_bias,
        linear_weight,
        linear_bias,
        residual=None,
        eps=1e-6,
        prenorm=False,
        residual_in_fp32=False,
        is_rms_norm=False,
    ):
        """
        Normalize ``x`` (optionally with a residual) and project the result.

        Args:
            ctx (Any): Autograd context.
            x (torch.Tensor): Input tensor; flattened to 2D over its last
                dimension for the normalization.
            norm_weight (torch.Tensor): Normalization weights.
            norm_bias (torch.Tensor | None): Normalization biases.
            linear_weight (torch.Tensor): Linear projection weights.
            linear_bias (torch.Tensor | None): Linear projection biases.
            residual (torch.Tensor, optional): Residual tensor; must have the
                same shape as ``x``. Defaults to None.
            eps (float, optional): Numerical stability epsilon.
                Defaults to 1e-6.
            prenorm (bool, optional): Also return the pre-normalization
                (residual) state. Defaults to False.
            residual_in_fp32 (bool, optional): Request a float32 residual
                when no ``residual`` tensor is given. Defaults to False.
            is_rms_norm (bool, optional): Use RMS norm instead of Layer norm.
                Defaults to False.

        Returns:
            torch.Tensor | tuple[torch.Tensor, torch.Tensor]: The projected
            output, or ``(out, residual_state)`` when ``prenorm`` is truthy,
            with the residual state reshaped to the original shape of ``x``.
        """
        orig_shape = x.shape
        # Flatten every leading dimension so the norm sees a 2D tensor.
        x = x.reshape(-1, x.shape[-1])
        if x.stride(-1) != 1:
            x = x.contiguous()

        has_residual = residual is not None
        if has_residual:
            assert residual.shape == orig_shape
            residual = residual.reshape(-1, residual.shape[-1])
            if residual.stride(-1) != 1:
                residual = residual.contiguous()

        norm_weight = norm_weight.contiguous()
        if norm_bias is not None:
            norm_bias = norm_bias.contiguous()

        # An explicit residual dictates the dtype; otherwise float32 is only
        # requested when the caller asked for it.
        if has_residual:
            residual_dtype = residual.dtype
        elif residual_in_fp32:
            residual_dtype = torch.float32
        else:
            residual_dtype = None

        # Under autocast the normalization output is produced directly in the
        # autocast dtype.
        if torch.is_autocast_enabled():
            norm_out_dtype = torch.get_autocast_gpu_dtype()
        else:
            norm_out_dtype = None

        fwd_outputs = _layer_norm_fwd(
            x,
            norm_weight,
            norm_bias,
            eps,
            residual,
            out_dtype=norm_out_dtype,
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
        )
        normed, _, mean, rstd, residual_out, *_unused = fwd_outputs
        normed = normed.reshape(orig_shape)

        # The projection runs in the autocast dtype when autocast is active,
        # and in the dtype of the normalized activation otherwise.
        if torch.is_autocast_enabled():
            proj_dtype = torch.get_autocast_gpu_dtype()
        else:
            proj_dtype = normed.dtype
        linear_weight = linear_weight.to(proj_dtype)
        if linear_bias is not None:
            linear_bias = linear_bias.to(proj_dtype)
        out = F.linear(
            normed.to(linear_weight.dtype), linear_weight, linear_bias
        )

        # The normalized activation is deliberately not saved: backward
        # recomputes it to save memory.
        ctx.save_for_backward(
            residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
        )
        ctx.x_shape_og = orig_shape
        ctx.eps = eps
        ctx.is_rms_norm = is_rms_norm
        ctx.has_residual = has_residual
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype
        ctx.linear_bias_is_none = linear_bias is None

        if prenorm:
            return out, residual_out.reshape(orig_shape)
        return out

    @staticmethod
    @custom_bwd(device_type="cuda")
    def backward(ctx, dout, *args):
        """
        Backpropagate through the linear projection and the normalization.

        Args:
            ctx (Any): Autograd context populated by ``forward``.
            dout (torch.Tensor): Gradient of the projected output.
            *args: Extra output gradients; when ``prenorm`` was set in the
                forward pass, ``args[0]`` is the gradient of the returned
                residual state.

        Returns:
            tuple: One entry per ``forward`` input after ``ctx``: gradients
            for ``x``, ``norm_weight``, ``norm_bias``, ``linear_weight``,
            ``linear_bias`` and ``residual`` (None when absent), followed by
            four ``None`` entries for the non-tensor arguments.
        """
        # The first saved tensor is the ``residual_out`` stored by forward,
        # i.e. the tensor that was actually normalized.
        x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors

        dout = dout.reshape(-1, dout.shape[-1])
        # Gradient w.r.t. the normalized activation: dout @ linear_weight.
        dy = F.linear(dout, linear_weight.t())
        if ctx.linear_bias_is_none:
            dlinear_bias = None
        else:
            dlinear_bias = dout.sum(0)
        if dy.stride(-1) != 1:
            dy = dy.contiguous()
        assert dy.shape == x.shape

        dresidual = None
        if ctx.prenorm:
            dresidual = args[0]
            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
            if dresidual.stride(-1) != 1:
                dresidual = dresidual.contiguous()
            assert dresidual.shape == x.shape

        bwd_outputs = _layer_norm_bwd(
            dy,
            x,
            norm_weight,
            norm_bias,
            ctx.eps,
            mean,
            rstd,
            dresidual=dresidual,
            has_residual=ctx.has_residual,
            is_rms_norm=ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
            recompute_output=True,
        )
        # The last element is the recomputed normalized activation, needed
        # for the weight gradient of the linear layer.
        dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = bwd_outputs
        dlinear_weight = torch.einsum("bo,bi->oi", dout, y)

        grad_x = dx.reshape(ctx.x_shape_og)
        if ctx.has_residual:
            grad_residual = dresidual_in.reshape(ctx.x_shape_og)
        else:
            grad_residual = None
        return (
            grad_x,
            dnorm_weight,
            dnorm_bias,
            dlinear_weight,
            dlinear_bias,
            grad_residual,
            None,
            None,
            None,
            None,
        )
def layer_norm_linear_fn(
    x,
    norm_weight,
    norm_bias,
    linear_weight,
    linear_bias,
    residual=None,
    eps=1e-6,
    prenorm=False,
    residual_in_fp32=False,
    is_rms_norm=False,
):
    """
    Apply a Layer/RMS normalization immediately followed by a linear layer.

    This is a functional front end for ``LayerNormLinearFn``: every argument
    is handed to ``LayerNormLinearFn.apply`` positionally and unchanged.

    Args:
        x (torch.Tensor): Input tensor.
        norm_weight (torch.Tensor): Normalization weights.
        norm_bias (torch.Tensor | None): Normalization biases.
        linear_weight (torch.Tensor): Linear projection weights.
        linear_bias (torch.Tensor | None): Linear projection biases.
        residual (torch.Tensor, optional): Residual tensor combined with the
            input before normalization. Defaults to None.
        eps (float, optional): Epsilon for numerical stability.
            Defaults to 1e-6.
        prenorm (bool, optional): Also return the pre-normalization
            (residual) state. Defaults to False.
        residual_in_fp32 (bool, optional): Keep the residual in FP32.
            Defaults to False.
        is_rms_norm (bool, optional): Compute RMS norm instead of Layer norm.
            Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, torch.Tensor]: The projected
        output. If ``prenorm=True``, returns ``(out, prenorm_state)``.
    """
    # autograd.Function.apply takes positional arguments only, so the order
    # below has to mirror LayerNormLinearFn.forward exactly.
    forwarded = (
        x,
        norm_weight,
        norm_bias,
        linear_weight,
        linear_bias,
        residual,
        eps,
        prenorm,
        residual_in_fp32,
        is_rms_norm,
    )
    return LayerNormLinearFn.apply(*forwarded)