Source code for lrnnx.ops.triton.layer_norm

"""
Reference: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/layer_norm.py
Copyright (c) 2024, Tri Dao.
Implement dropout + residual + layer_norm / rms_norm.

Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
"""

import math
import warnings

import torch
import torch.nn.functional as F
import triton
import triton.language as tl

from lrnnx.ops.torch import custom_bwd, custom_fwd



[docs]
def layer_norm_ref(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    dropout_mask=None,
    dropout_mask1=None,
    upcast=False,
):
    """
    Pure-PyTorch reference for (dropout + residual +) Layer Normalization.

    The steps, in order: optional float32 upcast, optional row scaling of ``x``,
    optional dropout of ``x`` (and ``x1``), sum of ``x``, ``x1`` and ``residual``,
    then ``F.layer_norm`` over the last dimension with ``weight``/``bias`` and,
    if given, a second time with ``weight1``/``bias1``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Layer norm weights.
        bias (torch.Tensor | None): Layer norm biases.
        residual (torch.Tensor, optional): Tensor added to the input before normalization. Defaults to None.
        x1 (torch.Tensor, optional): Second input branch, added to ``x`` after dropout. Defaults to None.
        weight1 (torch.Tensor, optional): Weights of the second normalization output. Defaults to None.
        bias1 (torch.Tensor, optional): Biases of the second normalization output. Defaults to None.
        eps (float, optional): Epsilon passed to ``F.layer_norm``. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability; dropout is applied only when > 0. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Per-row factor multiplied into ``x``; must be None when ``x1`` is given. Defaults to None.
        prenorm (bool, optional): If True, also return the summed pre-normalization tensor. Defaults to False.
        dropout_mask (torch.Tensor, optional): Boolean keep-mask for ``x``; when None, ``F.dropout`` is used. Defaults to None.
        dropout_mask1 (torch.Tensor, optional): Boolean keep-mask for ``x1``; when None, ``F.dropout`` is used. Defaults to None.
        upcast (bool, optional): Convert all tensor inputs to float32 before computing. Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: ``out`` alone, or a tuple
        ``(out[, out1][, x])`` where ``out1`` is present when ``weight1`` is given
        and the pre-normalization ``x`` is present when ``prenorm=True``. The
        normalized outputs are cast back to the dtype ``x`` had on entry.
    """
    out_dtype = x.dtype

    if upcast:
        x, weight = x.float(), weight.float()
        if bias is not None:
            bias = bias.float()
        if residual is not None:
            residual = residual.float()
        if x1 is not None:
            x1 = x1.float()
        if weight1 is not None:
            weight1 = weight1.float()
        if bias1 is not None:
            bias1 = bias1.float()

    if x1 is not None:
        assert (
            rowscale is None
        ), "rowscale is not supported with parallel LayerNorm"
    if rowscale is not None:
        x = x * rowscale[..., None]

    def _drop(t, keep):
        # An explicit keep-mask makes the result deterministic; otherwise
        # fall back to PyTorch's random dropout.
        if keep is None:
            return F.dropout(t, p=dropout_p)
        return t.masked_fill(~keep, 0.0) / (1.0 - dropout_p)

    if dropout_p > 0.0:
        x = _drop(x, dropout_mask)
        if x1 is not None:
            x1 = _drop(x1, dropout_mask1)

    if x1 is not None:
        x = x + x1
    if residual is not None:
        # Keep the dtype x had before the residual was added.
        summed_dtype = x.dtype
        x = (x + residual).to(summed_dtype)

    normalized_shape = x.shape[-1:]

    def _norm(w, b):
        # Normalize in the parameter dtype, then cast back to the entry dtype.
        return F.layer_norm(
            x.to(w.dtype), normalized_shape, weight=w, bias=b, eps=eps
        ).to(out_dtype)

    results = [_norm(weight, bias)]
    if weight1 is not None:
        results.append(_norm(weight1, bias1))
    if prenorm:
        results.append(x)
    return results[0] if len(results) == 1 else tuple(results)




[docs]
def rms_norm_ref(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    dropout_mask=None,
    dropout_mask1=None,
    upcast=False,
):
    """
    Pure-PyTorch reference for (dropout + residual +) RMS Normalization.

    The steps, in order: optional float32 upcast, optional row scaling of ``x``,
    optional dropout of ``x`` (and ``x1``), sum of ``x``, ``x1`` and ``residual``,
    then scaling by ``1 / sqrt(mean(x**2, dim=-1) + eps)`` and by ``weight``
    (plus ``bias``), and, if given, a second time with ``weight1``/``bias1``.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): RMS norm weights.
        bias (torch.Tensor | None): Biases added after scaling.
        residual (torch.Tensor, optional): Tensor added to the input before normalization. Defaults to None.
        x1 (torch.Tensor, optional): Second input branch, added to ``x`` after dropout. Defaults to None.
        weight1 (torch.Tensor, optional): Weights of the second normalization output. Defaults to None.
        bias1 (torch.Tensor, optional): Biases of the second normalization output. Defaults to None.
        eps (float, optional): Epsilon added to the mean square before the square root. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability; dropout is applied only when > 0. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Per-row factor multiplied into ``x``; must be None when ``x1`` is given. Defaults to None.
        prenorm (bool, optional): If True, also return the summed pre-normalization tensor. Defaults to False.
        dropout_mask (torch.Tensor, optional): Boolean keep-mask for ``x``; when None, ``F.dropout`` is used. Defaults to None.
        dropout_mask1 (torch.Tensor, optional): Boolean keep-mask for ``x1``; when None, ``F.dropout`` is used. Defaults to None.
        upcast (bool, optional): Convert all tensor inputs to float32 before computing. Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, ...]: ``out`` alone, or a tuple
        ``(out[, out1][, x])`` where ``out1`` is present when ``weight1`` is given
        and the pre-normalization ``x`` is present when ``prenorm=True``. The
        normalized outputs are cast back to the dtype ``x`` had on entry.
    """
    out_dtype = x.dtype

    if upcast:
        x, weight = x.float(), weight.float()
        if bias is not None:
            bias = bias.float()
        if residual is not None:
            residual = residual.float()
        if x1 is not None:
            x1 = x1.float()
        if weight1 is not None:
            weight1 = weight1.float()
        if bias1 is not None:
            bias1 = bias1.float()

    if x1 is not None:
        assert (
            rowscale is None
        ), "rowscale is not supported with parallel LayerNorm"
    if rowscale is not None:
        x = x * rowscale[..., None]

    def _drop(t, keep):
        # An explicit keep-mask makes the result deterministic; otherwise
        # fall back to PyTorch's random dropout.
        if keep is None:
            return F.dropout(t, p=dropout_p)
        return t.masked_fill(~keep, 0.0) / (1.0 - dropout_p)

    if dropout_p > 0.0:
        x = _drop(x, dropout_mask)
        if x1 is not None:
            x1 = _drop(x1, dropout_mask1)

    if x1 is not None:
        x = x + x1
    if residual is not None:
        # Keep the dtype x had before the residual was added.
        summed_dtype = x.dtype
        x = (x + residual).to(summed_dtype)

    # Reciprocal root-mean-square over the last dimension.
    mean_square = (x.square()).mean(dim=-1, keepdim=True)
    rstd = 1 / torch.sqrt(mean_square + eps)

    def _scale(w, b):
        scaled = x * rstd * w
        if b is not None:
            scaled = scaled + b
        return scaled.to(out_dtype)

    results = [_scale(weight, bias)]
    if weight1 is not None:
        results.append(_scale(weight1, bias1))
    if prenorm:
        results.append(x)
    return results[0] if len(results) == 1 else tuple(results)




[docs]
def config_prune(configs):
    """
    Keep only the Triton configurations whose ``num_warps`` fits in one block.

    The limit is ``1024 // warp_size``. The warp size is 32 on CUDA builds; on
    ROCm builds (``torch.version.hip`` truthy) it is 32 for ``gfx10``/``gfx11``
    architectures and 64 otherwise, as reported by device 0.

    Args:
        configs (list[triton.Config]): List of Triton kernel configurations.

    Returns:
        list[triton.Config]: The configurations with ``num_warps`` within the
        limit, or ``configs`` itself if comparing ``num_warps`` raises ``TypeError``.
    """
    # CUDA warp size; overridden below for ROCm builds.
    warp_size = 32

    if torch.version.hip:
        try:
            # Pick the warp size from the GCN architecture name.
            arch = torch.cuda.get_device_properties(0).gcnArchName
            is_radeon = any(tag in arch for tag in ("gfx10", "gfx11"))
            warp_size = 32 if is_radeon else 64
        except AttributeError as e:
            # gcnArchName is unavailable: guess from the device name instead.
            device_name = torch.cuda.get_device_properties(0).name
            warp_size = 64 if "instinct" in device_name.lower() else 32
            warnings.warn(
                f"{e}, warp size set to {warp_size} based on device name: {device_name}",
                UserWarning,
            )

    max_block_sz = 1024
    warp_limit = max_block_sz // warp_size

    try:
        kept = []
        for candidate in configs:
            if candidate.num_warps <= warp_limit:
                kept.append(candidate)
    except TypeError:
        # Bypass for Sphinx documentation builds where Triton is mocked
        return configs
    return kept



# Autotuning candidates: one empty-meta config per warp count.
configs_autotune = [
    triton.Config({}, num_warps=warps) for warps in (1, 2, 4, 8, 16, 32)
]

# Candidates that survive config_prune; shared by the kernels' autotuners.
pruned_configs_autotune = config_prune(configs_autotune)


@triton.autotune(
    configs=pruned_configs_autotune,
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
# HAS_X1 / HAS_W1 / HAS_B1 are derived from whether the matching pointer is None.
@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    RESIDUAL,  # pointer to the residual
    X1,  # pointer to the optional second input (added to x)
    W1,  # pointer to the weights of the optional second output
    B1,  # pointer to the biases of the optional second output
    Y1,  # pointer to the optional second output
    RESIDUAL_OUT,  # pointer where the pre-normalization sum is stored
    ROWSCALE,  # pointer to the per-row scale factors
    SEEDS,  # Dropout seeds for each row
    DROPOUT_MASK,  # pointer where the dropout keep-mask is stored
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_res_row,
    stride_res_out_row,
    stride_x1_row,
    stride_y1_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,  # Dropout probability
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    STORE_DROPOUT_MASK: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_X1: tl.constexpr,
    HAS_W1: tl.constexpr,
    HAS_B1: tl.constexpr,
):
    """
    Triton JIT kernel for fused layer/rms norm forward pass.

    Each program handles the single row given by ``tl.program_id(0)`` and loads
    it as one block of ``BLOCK_N`` columns masked to ``cols < N``. In order, it:

    1. loads the row of ``X`` (as float32), optionally multiplies it by the
       row's ``ROWSCALE`` entry and applies dropout;
    2. optionally does the same for ``X1`` and adds it, then adds ``RESIDUAL``;
    3. optionally stores that sum to ``RESIDUAL_OUT``;
    4. computes ``mean`` (LayerNorm only, stored to ``Mean``) and
       ``rstd = 1 / sqrt(var + eps)`` (stored to ``Rstd``);
    5. writes ``Y = x_hat * W (+ B)`` and, when ``W1`` is given,
       ``Y1 = x_hat * W1 (+ B1)``.

    For the ``X1`` branch, ``ROWSCALE``, ``SEEDS`` and ``DROPOUT_MASK`` are
    indexed at row ``M + row``.
    """
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    X += row * stride_x_row
    Y += row * stride_y_row
    if HAS_RESIDUAL:
        RESIDUAL += row * stride_res_row
    if STORE_RESIDUAL_OUT:
        RESIDUAL_OUT += row * stride_res_out_row
    if HAS_X1:
        X1 += row * stride_x1_row
    if HAS_W1:
        Y1 += row * stride_y1_row
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    # Columns at or beyond N are loaded as 0.0.
    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_ROWSCALE:
        rowscale = tl.load(ROWSCALE + row).to(tl.float32)
        x *= rowscale
    if HAS_DROPOUT:
        # Compute dropout mask
        # 7 rounds is good enough, and reduces register pressure
        # An element is kept when its random draw exceeds dropout_p.
        keep_mask = (
            tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
            > dropout_p
        )
        # Kept elements are rescaled by 1 / (1 - dropout_p); dropped ones become 0.
        x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
        if STORE_DROPOUT_MASK:
            # The mask buffer is addressed with a row stride of N.
            tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
    if HAS_X1:
        x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
        if HAS_ROWSCALE:
            # The X1 branch reads its row scale at index M + row.
            rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
            x1 *= rowscale
        if HAS_DROPOUT:
            # Compute dropout mask
            # 7 rounds is good enough, and reduces register pressure
            # The X1 branch reads its seed at index M + row.
            keep_mask = (
                tl.rand(
                    tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7
                )
                > dropout_p
            )
            x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
            if STORE_DROPOUT_MASK:
                # The X1 mask is written at row M + row of the mask buffer.
                tl.store(
                    DROPOUT_MASK + (M + row) * N + cols,
                    keep_mask,
                    mask=cols < N,
                )
        x += x1
    if HAS_RESIDUAL:
        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(
            tl.float32
        )
        x += residual
    if STORE_RESIDUAL_OUT:
        # Store the sum that is about to be normalized.
        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        # Zero the padded columns so they do not contribute to the variance.
        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        # RMS norm: mean of squares, without subtracting a mean.
        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    # Write output
    tl.store(Y + cols, y, mask=mask)
    if HAS_W1:
        # Second output: the same x_hat with its own weights (and biases).
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
        if HAS_B1:
            b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
        y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
        tl.store(Y1 + cols, y1, mask=mask)


def _layer_norm_fwd(
    x,
    weight,
    bias,
    eps,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    dropout_p=0.0,
    rowscale=None,
    out_dtype=None,
    residual_dtype=None,
    is_rms_norm=False,
    return_dropout_mask=False,
):
    """
    Validate inputs, allocate outputs and launch ``_layer_norm_fwd_1pass_kernel``.

    ``x`` must be 2-D ``(M, N)`` with unit stride in the last dimension; one
    kernel program is launched per row.

    Returns:
        tuple: ``(y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1)``.
        ``y1`` is None without ``weight1``; ``mean`` is None for RMS norm;
        the fifth item is ``x`` itself when no separate residual buffer was
        needed; ``seeds`` is None without dropout; the masks are None unless
        ``return_dropout_mask`` is set and dropout is active (``dropout_mask1``
        additionally requires ``x1``).

    Raises:
        RuntimeError: If ``N`` exceeds the number of elements that fit in 64KB.
    """
    M, N = x.shape
    # A supplied residual dictates the dtype of the residual output.
    if residual is not None:
        residual_dtype = residual.dtype

    # --- input checks ---
    assert x.stride(-1) == 1
    if residual is not None:
        assert residual.stride(-1) == 1
        assert residual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if x1 is not None:
        assert x1.shape == x.shape
        assert rowscale is None
        assert x1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)

    # --- outputs ---
    y = torch.empty_like(x, dtype=out_dtype if out_dtype is not None else x.dtype)
    assert y.stride(-1) == 1
    y1 = None
    if weight1 is not None:
        y1 = torch.empty_like(y)
        assert y1.stride(-1) == 1

    use_dropout = dropout_p > 0.0
    # A separate pre-norm buffer is needed whenever the normalized tensor
    # differs from x in value or in dtype.
    needs_residual_out = (
        residual is not None
        or (residual_dtype is not None and residual_dtype != x.dtype)
        or use_dropout
        or rowscale is not None
        or x1 is not None
    )
    residual_out = None
    if needs_residual_out:
        residual_out = torch.empty(
            M,
            N,
            device=x.device,
            dtype=x.dtype if residual_dtype is None else residual_dtype,
        )
        assert residual_out.stride(-1) == 1

    # Per-row statistics; the mean is only produced for LayerNorm.
    mean = None
    if not is_rms_norm:
        mean = torch.empty((M,), dtype=torch.float32, device=x.device)
    rstd = torch.empty((M,), dtype=torch.float32, device=x.device)

    # With a second input, seeds and mask hold 2 * M rows (x first, then x1).
    random_rows = 2 * M if x1 is not None else M
    seeds = None
    if use_dropout:
        seeds = torch.randint(
            2**32,
            (random_rows,),
            device=x.device,
            dtype=torch.int64,
        )
    dropout_mask = None
    if return_dropout_mask and use_dropout:
        dropout_mask = torch.empty(
            random_rows, N, device=x.device, dtype=torch.bool
        )

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB."
        )

    stride_res = residual.stride(0) if residual is not None else 0
    stride_res_out = residual_out.stride(0) if residual_out is not None else 0
    stride_x1 = x1.stride(0) if x1 is not None else 0
    stride_y1 = y1.stride(0) if y1 is not None else 0
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[(M,)](
            x,
            y,
            weight,
            bias,
            residual,
            x1,
            weight1,
            bias1,
            y1,
            residual_out,
            rowscale,
            seeds,
            dropout_mask,
            mean,
            rstd,
            x.stride(0),
            y.stride(0),
            stride_res,
            stride_res_out,
            stride_x1,
            stride_y1,
            M,
            N,
            eps,
            dropout_p,
            is_rms_norm,
            BLOCK_N,
            residual is not None,
            residual_out is not None,
            bias is not None,
            use_dropout,
            dropout_mask is not None,
            rowscale is not None,
        )

    # Split the stacked mask into its x and x1 halves.
    dropout_mask1 = None
    if dropout_mask is not None and x1 is not None:
        dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)

    # Without a separate buffer, the pre-norm tensor is x itself.
    prenorm_out = x if residual_out is None else residual_out
    return (
        y,
        y1,
        mean,
        rstd,
        prenorm_out,
        seeds,
        dropout_mask,
        dropout_mask1,
    )


@triton.autotune(
    configs=pruned_configs_autotune,
    key=[
        "N",
        "HAS_DRESIDUAL",
        "STORE_DRESIDUAL",
        "IS_RMS_NORM",
        "HAS_BIAS",
        "HAS_DROPOUT",
    ],
)
# The flags below are derived from whether the matching pointer is None.
@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
@triton.jit
def _layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DRESIDUAL,  # pointer to an incoming gradient that is added to dx
    W1,  # pointer to the weights of the second output
    DY1,  # pointer to the gradient of the second output
    DX1,  # pointer to the gradient of the second input
    DW1,  # pointer to the partial sum of the second weights gradient
    DB1,  # pointer to the partial sum of the second biases gradient
    DRESIDUAL_IN,  # pointer where dx is stored before dropout/rowscale
    ROWSCALE,  # pointer to the per-row scale factors
    SEEDS,  # pointer to the per-row dropout seeds
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_dy_row,
    stride_dx_row,
    stride_dres_row,
    stride_dy1_row,
    stride_dx1_row,
    stride_dres_in_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,
    rows_per_program,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_DRESIDUAL: tl.constexpr,
    STORE_DRESIDUAL: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_DY1: tl.constexpr,
    HAS_DX1: tl.constexpr,
    HAS_B1: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
):
    """
    Triton JIT kernel for fused layer/rms norm backward pass.

    Each program handles the rows ``[row_start, row_end)`` with
    ``row_start = program_id * rows_per_program`` and ``row_end`` clamped to
    ``M``. For every row it recomputes ``xhat`` from ``X``, ``Mean`` and
    ``Rstd``, optionally rewrites ``Y``, and computes ``dx``. ``dx`` is stored
    to ``DRESIDUAL_IN`` (after adding ``DRESIDUAL``) and to ``DX1`` (with the
    second branch's dropout mask) when enabled, then to ``DX`` after the first
    branch's dropout mask and row scale are applied.

    The weight/bias gradients are accumulated over the program's rows and
    written once at the end to row ``program_id`` of ``DW``/``DB``/``DW1``/``DB1``.
    """
    # Map the program id to the elements of X, DX, and DY it should compute.
    row_block_id = tl.program_id(0)
    row_start = row_block_id * rows_per_program
    # Do not early exit if row_start >= M, because we need to write DW and DB
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    # Advance every per-row pointer to this program's first row.
    X += row_start * stride_x_row
    if HAS_DRESIDUAL:
        DRESIDUAL += row_start * stride_dres_row
    if STORE_DRESIDUAL:
        DRESIDUAL_IN += row_start * stride_dres_in_row
    DY += row_start * stride_dy_row
    DX += row_start * stride_dx_row
    if HAS_DY1:
        DY1 += row_start * stride_dy1_row
    if HAS_DX1:
        DX1 += row_start * stride_dx1_row
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row
    # Weights (and biases) are loaded once and reused for every row.
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_DY1:
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
    # Per-program accumulators for the parameter gradients.
    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_DY1:
        dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
        if HAS_B1:
            db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
    row_end = min((row_block_id + 1) * rows_per_program, M)
    for row in range(row_start, row_end):
        # Load data to SRAM
        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
        if HAS_DY1:
            dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        # Zero the padded columns so they do not enter the reductions below.
        xhat = tl.where(mask, xhat, 0.0)
        if RECOMPUTE_OUTPUT:
            y = xhat * w + b if HAS_BIAS else xhat * w
            tl.store(Y + cols, y, mask=mask)
        wdy = w * dy
        dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if HAS_DY1:
            # The second output shares xhat, so its gradient adds into wdy.
            wdy += w1 * dy1
            dw1 += dy1 * xhat
            if HAS_B1:
                db1 += dy1
        if not IS_RMS_NORM:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            # RMS norm has no mean term, so only c1 is needed.
            c1 = tl.sum(xhat * wdy, axis=0) / N
            dx = (wdy - xhat * c1) * rstd
        if HAS_DRESIDUAL:
            dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
            dx += dres
        # Write dx
        if STORE_DRESIDUAL:
            # Stored before dropout and row scale are applied to dx.
            tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
        if HAS_DX1:
            if HAS_DROPOUT:
                # Regenerate the second branch's keep-mask from its seed at M + row.
                keep_mask = (
                    tl.rand(
                        tl.load(SEEDS + M + row).to(tl.uint32),
                        cols,
                        n_rounds=7,
                    )
                    > dropout_p
                )
                dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
            else:
                dx1 = dx
            tl.store(DX1 + cols, dx1, mask=mask)
        if HAS_DROPOUT:
            # Regenerate the first branch's keep-mask from its seed at row.
            keep_mask = (
                tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
                > dropout_p
            )
            dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
        if HAS_ROWSCALE:
            rowscale = tl.load(ROWSCALE + row).to(tl.float32)
            dx *= rowscale
        tl.store(DX + cols, dx, mask=mask)

        # Advance every per-row pointer to the next row.
        X += stride_x_row
        if HAS_DRESIDUAL:
            DRESIDUAL += stride_dres_row
        if STORE_DRESIDUAL:
            DRESIDUAL_IN += stride_dres_in_row
        if RECOMPUTE_OUTPUT:
            Y += stride_y_row
        DY += stride_dy_row
        DX += stride_dx_row
        if HAS_DY1:
            DY1 += stride_dy1_row
        if HAS_DX1:
            DX1 += stride_dx1_row
    # Write this program's partial sums to its own row of the gradient buffers.
    tl.store(DW + row_block_id * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * N + cols, db, mask=mask)
    if HAS_DY1:
        tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
        if HAS_B1:
            tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)


def _layer_norm_bwd(
    dy,
    x,
    weight,
    bias,
    eps,
    mean,
    rstd,
    dresidual=None,
    dy1=None,
    weight1=None,
    bias1=None,
    seeds=None,
    dropout_p=0.0,
    rowscale=None,
    has_residual=False,
    has_x1=False,
    is_rms_norm=False,
    x_dtype=None,
    recompute_output=False,
):
    """
    Python wrapper for the Triton fused layer/rms norm backward kernel.

    Validates the inputs, allocates the gradient buffers and launches
    ``_layer_norm_bwd_kernel`` with one program per multiprocessor of
    ``x.device``. Each program writes one row of partial weight/bias
    gradients; the rows are summed here and cast to the parameter dtypes.

    Args:
        dy (torch.Tensor): Gradient of the output, shape ``(M, N)``.
        x (torch.Tensor): Tensor that was normalized in the forward pass, shape ``(M, N)``.
        weight (torch.Tensor): Normalization weights, shape ``(N,)``.
        bias (torch.Tensor | None): Normalization biases, shape ``(N,)``.
        eps (float): Epsilon forwarded to the kernel.
        mean (torch.Tensor | None): Per-row means from the forward pass.
        rstd (torch.Tensor): Per-row reciprocal standard deviations from the forward pass.
        dresidual (torch.Tensor, optional): Gradient added to ``dx``, shape ``(M, N)``.
        dy1 (torch.Tensor, optional): Gradient of the second output; requires ``weight1``.
        weight1 (torch.Tensor, optional): Weights of the second output, shape ``(N,)``.
        bias1 (torch.Tensor, optional): Biases of the second output, shape ``(N,)``.
        seeds (torch.Tensor, optional): Dropout seeds, shape ``(M,)`` or ``(2 * M,)`` with ``has_x1``.
        dropout_p (float, optional): Dropout probability used in the forward pass.
        rowscale (torch.Tensor, optional): Per-row scale factors, shape ``(M,)``.
        has_residual (bool, optional): Whether a residual gradient must be returned.
        has_x1 (bool, optional): Whether the forward pass had a second input.
        is_rms_norm (bool, optional): RMS norm instead of layer norm.
        x_dtype (torch.dtype, optional): Dtype of ``dx``; defaults to ``x.dtype``.
        recompute_output (bool, optional): Also recompute and return the forward output.

    Returns:
        tuple: ``(dx, dw, db, dresidual_in, dx1, dw1, db1)``, with ``y``
        appended when ``recompute_output`` is True. Entries that do not apply
        are None.

    Raises:
        RuntimeError: If ``N`` exceeds the number of elements that fit in 64KB.
    """
    M, N = x.shape
    assert x.stride(-1) == 1
    assert dy.stride(-1) == 1
    assert dy.shape == (M, N)
    if dresidual is not None:
        assert dresidual.stride(-1) == 1
        assert dresidual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if dy1 is not None:
        assert weight1 is not None
        assert dy1.shape == dy.shape
        assert dy1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if seeds is not None:
        assert seeds.is_contiguous()
        assert seeds.shape == (M if not has_x1 else M * 2,)
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)
    # allocate output
    dx = (
        torch.empty_like(x)
        if x_dtype is None
        else torch.empty(M, N, dtype=x_dtype, device=x.device)
    )
    # A separate residual gradient is only needed when it differs from dx
    # in dtype or value (dropout / rowscale / second input).
    dresidual_in = (
        torch.empty_like(x)
        if has_residual
        and (
            dx.dtype != x.dtype
            or dropout_p > 0.0
            or rowscale is not None
            or has_x1
        )
        else None
    )
    # Without dropout, dx1 equals dx and is aliased after the kernel runs.
    dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
    y = (
        torch.empty(M, N, dtype=dy.dtype, device=dy.device)
        if recompute_output
        else None
    )
    if recompute_output:
        assert (
            weight1 is None
        ), "recompute_output is not supported with parallel LayerNorm"

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError(
            "This layer norm doesn't support feature dim >= 64KB."
        )
    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
    # One row of float32 partial sums per program.
    _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
    _db = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
        if bias is not None
        else None
    )
    _dw1 = torch.empty_like(_dw) if weight1 is not None else None
    # Allocate explicitly rather than with empty_like(_db): _db is None when
    # bias is None, yet bias1 may still be given.
    _db1 = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias1.device)
        if bias1 is not None
        else None
    )
    rows_per_program = math.ceil(M / sm_count)
    grid = (sm_count,)
    with torch.cuda.device(x.device.index):
        _layer_norm_bwd_kernel[grid](
            x,
            weight,
            bias,
            y,
            dy,
            dx,
            _dw,
            _db,
            dresidual,
            weight1,
            dy1,
            dx1,
            _dw1,
            _db1,
            dresidual_in,
            rowscale,
            seeds,
            mean,
            rstd,
            x.stride(0),
            0 if not recompute_output else y.stride(0),
            dy.stride(0),
            dx.stride(0),
            dresidual.stride(0) if dresidual is not None else 0,
            dy1.stride(0) if dy1 is not None else 0,
            dx1.stride(0) if dx1 is not None else 0,
            dresidual_in.stride(0) if dresidual_in is not None else 0,
            M,
            N,
            eps,
            dropout_p,
            rows_per_program,
            is_rms_norm,
            BLOCK_N,
            dresidual is not None,
            dresidual_in is not None,
            bias is not None,
            dropout_p > 0.0,
        )
    # Reduce the per-program partial sums and cast to the parameter dtypes.
    dw = _dw.sum(0).to(weight.dtype)
    db = _db.sum(0).to(bias.dtype) if bias is not None else None
    dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
    db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
    # Don't need to compute dresidual_in separately in this case
    if (
        has_residual
        and dx.dtype == x.dtype
        and dropout_p == 0.0
        and rowscale is None
    ):
        dresidual_in = dx
    if has_x1 and dropout_p == 0.0:
        dx1 = dx
    return (
        (dx, dw, db, dresidual_in, dx1, dw1, db1)
        if not recompute_output
        else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
    )



[docs]
class LayerNormFn(torch.autograd.Function):
    """
    Custom autograd function around the fused Layer/RMS norm forward and backward helpers.

    Supports an optional residual input, an optional parallel branch
    (``x1`` / ``weight1`` / ``bias1``), dropout and a per-row scale.
    """

    @staticmethod
    def _as_rows(t):
        """Flatten ``t`` to 2D ``(-1, last_dim)`` and make the last dimension unit-stride."""
        t = t.reshape(-1, t.shape[-1])
        return t if t.stride(-1) == 1 else t.contiguous()

    @staticmethod
    def forward(
        ctx,
        x,
        weight,
        bias,
        residual=None,
        x1=None,
        weight1=None,
        bias1=None,
        eps=1e-6,
        dropout_p=0.0,
        rowscale=None,
        prenorm=False,
        residual_in_fp32=False,
        is_rms_norm=False,
        return_dropout_mask=False,
    ):
        """
        Run the fused normalization forward pass and stash what backward needs.

        Args:
            ctx (Any): Autograd context.
            x (torch.Tensor): Input tensor; flattened to 2D over its last dimension.
            weight (torch.Tensor): Normalization weights.
            bias (torch.Tensor | None): Normalization biases.
            residual (torch.Tensor, optional): Residual tensor; must have the same shape as ``x``. Defaults to None.
            x1 (torch.Tensor, optional): Parallel input branch; must have the same shape as ``x`` and cannot be combined with ``rowscale``. Defaults to None.
            weight1 (torch.Tensor, optional): Parallel branch weights. Defaults to None.
            bias1 (torch.Tensor, optional): Parallel branch biases. Defaults to None.
            eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
            dropout_p (float, optional): Dropout probability. Defaults to 0.0.
            rowscale (torch.Tensor, optional): Row-wise scaling factor; flattened to 1D. Defaults to None.
            prenorm (bool, optional): If True, the residual output is returned as well. Defaults to False.
            residual_in_fp32 (bool, optional): When no ``residual`` is given, request a float32 residual output. Defaults to False.
            is_rms_norm (bool, optional): If True, computes RMS norm instead of Layer norm. Defaults to False.
            return_dropout_mask (bool, optional): If True, the two dropout masks are appended to the outputs. Defaults to False.

        Returns:
            torch.Tensor | tuple: ``y`` alone when nothing else is requested. Otherwise a tuple
            built in this order: ``y``, ``y1`` (only if ``weight1`` is given), the residual output
            (only if ``prenorm``), then ``dropout_mask`` and ``dropout_mask1`` (only if
            ``return_dropout_mask``).
        """
        x_shape_og = x.shape
        # The kernels work on 2D (rows, features) tensors.
        x = LayerNormFn._as_rows(x)
        if residual is not None:
            assert residual.shape == x_shape_og
            residual = LayerNormFn._as_rows(residual)
        if x1 is not None:
            assert x1.shape == x_shape_og
            assert (
                rowscale is None
            ), "rowscale is not supported with parallel LayerNorm"
            x1 = LayerNormFn._as_rows(x1)

        weight = weight.contiguous()
        bias = None if bias is None else bias.contiguous()
        weight1 = None if weight1 is None else weight1.contiguous()
        bias1 = None if bias1 is None else bias1.contiguous()
        if rowscale is not None:
            rowscale = rowscale.reshape(-1).contiguous()

        # An explicit residual dictates the residual dtype; otherwise fp32 is
        # requested only when the caller asked for it.
        if residual is not None:
            residual_dtype = residual.dtype
        elif residual_in_fp32:
            residual_dtype = torch.float32
        else:
            residual_dtype = None

        fwd_result = _layer_norm_fwd(
            x,
            weight,
            bias,
            eps,
            residual,
            x1,
            weight1,
            bias1,
            dropout_p=dropout_p,
            rowscale=rowscale,
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
            return_dropout_mask=return_dropout_mask,
        )
        (
            y,
            y1,
            mean,
            rstd,
            residual_out,
            seeds,
            dropout_mask,
            dropout_mask1,
        ) = fwd_result

        # The residual output (not the raw input) is what backward consumes as "x".
        ctx.save_for_backward(
            residual_out,
            weight,
            bias,
            weight1,
            bias1,
            rowscale,
            seeds,
            mean,
            rstd,
        )
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.dropout_p = dropout_p
        ctx.is_rms_norm = is_rms_norm
        ctx.has_residual = residual is not None
        ctx.has_x1 = x1 is not None
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype

        def restore(t):
            # Give an optional 2D result the caller's original shape back.
            return None if t is None else t.reshape(x_shape_og)

        y = y.reshape(x_shape_og)
        y1 = restore(y1)
        residual_out = restore(residual_out)
        dropout_mask = restore(dropout_mask)
        dropout_mask1 = restore(dropout_mask1)

        outputs = [y]
        if weight1 is not None:
            outputs.append(y1)
        if prenorm:
            outputs.append(residual_out)
        if return_dropout_mask:
            outputs.extend((dropout_mask, dropout_mask1))
        # A lone ``y`` is returned bare rather than as a 1-tuple.
        return outputs[0] if len(outputs) == 1 else tuple(outputs)

    @staticmethod
    def backward(ctx, dy, *args):
        """
        Compute gradients for the inputs of :meth:`forward`.

        Args:
            ctx (Any): Autograd context populated by :meth:`forward`.
            dy (torch.Tensor): Gradient of the main output ``y``.
            *args: Gradients of the remaining forward outputs. The first entry is the
                gradient of ``y1`` when ``weight1`` was given; the next entry is the
                gradient of the residual output when ``prenorm`` was set.

        Returns:
            tuple: One entry per forward input (14 in total): gradients for ``x``, ``weight``,
            ``bias``, ``residual``, ``x1``, ``weight1`` and ``bias1``, followed by ``None``
            for the seven non-tensor/configuration inputs.
        """
        (
            x,
            weight,
            bias,
            weight1,
            bias1,
            rowscale,
            seeds,
            mean,
            rstd,
        ) = ctx.saved_tensors

        dy = LayerNormFn._as_rows(dy)
        assert dy.shape == x.shape

        # Walk through the extra output gradients in the order forward emitted them.
        cursor = 0
        dy1 = None
        if weight1 is not None:
            dy1 = LayerNormFn._as_rows(args[cursor])
            cursor += 1
            assert dy1.shape == x.shape
        dresidual = None
        if ctx.prenorm:
            dresidual = LayerNormFn._as_rows(args[cursor])
            assert dresidual.shape == x.shape

        dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
            dy,
            x,
            weight,
            bias,
            ctx.eps,
            mean,
            rstd,
            dresidual,
            dy1,
            weight1,
            bias1,
            seeds,
            ctx.dropout_p,
            rowscale,
            ctx.has_residual,
            ctx.has_x1,
            ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
        )

        og_shape = ctx.x_shape_og
        tensor_grads = (
            dx.reshape(og_shape),
            dw,
            db,
            dresidual_in.reshape(og_shape) if ctx.has_residual else None,
            None if dx1 is None else dx1.reshape(og_shape),
            dw1,
            db1,
        )
        # eps, dropout_p, rowscale, prenorm, residual_in_fp32, is_rms_norm and
        # return_dropout_mask receive no gradient.
        return tensor_grads + (None,) * 7





[docs]
def layer_norm_fn(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
    is_rms_norm=False,
    return_dropout_mask=False,
):
    """
    Functional entry point for the fused Layer Normalization (dispatches to ``LayerNormFn``).

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Normalization weights.
        bias (torch.Tensor | None): Normalization biases.
        residual (torch.Tensor, optional): Residual tensor to add before the norm. Defaults to None.
        x1 (torch.Tensor, optional): Parallel input branch. Defaults to None.
        weight1 (torch.Tensor, optional): Parallel branch weights. Defaults to None.
        bias1 (torch.Tensor, optional): Parallel branch biases. Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Row-wise scaling factor. Defaults to None.
        prenorm (bool, optional): Also return the pre-normalized (residual) state. Defaults to False.
        residual_in_fp32 (bool, optional): Maintain the residual in FP32. Defaults to False.
        is_rms_norm (bool, optional): If True, computes RMS norm instead of Layer norm. Defaults to False.
        return_dropout_mask (bool, optional): Also return the generated dropout masks. Defaults to False.

    Returns:
        torch.Tensor | tuple: The normalized output on its own, or a tuple that starts with it and
        additionally carries the parallel-branch output (when ``weight1`` is given), the prenorm
        state (when ``prenorm=True``) and the two dropout masks (when ``return_dropout_mask=True``),
        in that order.
    """
    # autograd.Function.apply only takes positional arguments, so the order
    # below has to mirror LayerNormFn.forward exactly.
    forward_args = (
        x,
        weight,
        bias,
        residual,
        x1,
        weight1,
        bias1,
        eps,
        dropout_p,
        rowscale,
        prenorm,
        residual_in_fp32,
        is_rms_norm,
        return_dropout_mask,
    )
    return LayerNormFn.apply(*forward_args)




[docs]
def rms_norm_fn(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
    return_dropout_mask=False,
):
    """
    Functional entry point for the fused RMS Normalization.

    Identical to calling ``LayerNormFn`` with ``is_rms_norm`` fixed to True.

    Args:
        x (torch.Tensor): Input tensor.
        weight (torch.Tensor): Normalization weights.
        bias (torch.Tensor | None): Normalization biases.
        residual (torch.Tensor, optional): Residual tensor to add before the norm. Defaults to None.
        x1 (torch.Tensor, optional): Parallel input branch. Defaults to None.
        weight1 (torch.Tensor, optional): Parallel branch weights. Defaults to None.
        bias1 (torch.Tensor, optional): Parallel branch biases. Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        rowscale (torch.Tensor, optional): Row-wise scaling factor. Defaults to None.
        prenorm (bool, optional): Also return the pre-normalized (residual) state. Defaults to False.
        residual_in_fp32 (bool, optional): Maintain the residual in FP32. Defaults to False.
        return_dropout_mask (bool, optional): Also return the generated dropout masks. Defaults to False.

    Returns:
        torch.Tensor | tuple: The RMS-normalized output on its own, or a tuple that starts with it
        and additionally carries the parallel-branch output (when ``weight1`` is given), the
        prenorm state (when ``prenorm=True``) and the two dropout masks (when
        ``return_dropout_mask=True``), in that order.
    """
    use_rms_norm = True  # this wrapper always selects the RMS variant
    forward_args = (
        x,
        weight,
        bias,
        residual,
        x1,
        weight1,
        bias1,
        eps,
        dropout_p,
        rowscale,
        prenorm,
        residual_in_fp32,
        use_rms_norm,
        return_dropout_mask,
    )
    return LayerNormFn.apply(*forward_args)




[docs]
class RMSNorm(torch.nn.Module):
    """
    Module wrapper around the fused RMS normalization.

    Holds a learnable ``weight`` of size ``hidden_size`` (initialized to ones) and
    registers ``bias`` as an absent parameter.

    Args:
        hidden_size (int): Dimension of the features to normalize.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-5.
        dropout_p (float, optional): Dropout probability. Defaults to 0.0.
        device (torch.device, optional): Device for parameters. Defaults to None.
        dtype (torch.dtype, optional): Data type for parameters. Defaults to None.
    """

    def __init__(
        self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None
    ):
        super().__init__()
        self.eps = eps
        # The Dropout module is only a container for the probability; forward()
        # reads ``drop.p`` and hands it to the fused function.
        self.drop = torch.nn.Dropout(dropout_p) if dropout_p > 0.0 else None
        self.weight = torch.nn.Parameter(
            torch.empty(hidden_size, device=device, dtype=dtype)
        )
        self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Fill the weight vector with ones."""
        torch.nn.init.ones_(self.weight)

    def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
        """
        Apply RMS normalization to ``x`` via ``rms_norm_fn``.

        Args:
            x (torch.Tensor): Input tensor.
            residual (torch.Tensor, optional): Optional residual connection. Defaults to None.
            prenorm (bool, optional): Whether to also return the state before normalization. Defaults to False.
            residual_in_fp32 (bool, optional): Keep the residual in float32. Defaults to False.

        Returns:
            torch.Tensor | tuple[torch.Tensor, torch.Tensor]: Normalized output. If ``prenorm=True``, returns ``(out, prenorm_state)``.
        """
        # Dropout is active only in training mode and only if it was configured.
        active_dropout_p = 0.0
        if self.drop is not None and self.training:
            active_dropout_p = self.drop.p
        return rms_norm_fn(
            x,
            self.weight,
            self.bias,
            residual=residual,
            eps=self.eps,
            dropout_p=active_dropout_p,
            prenorm=prenorm,
            residual_in_fp32=residual_in_fp32,
        )





[docs]
class LayerNormLinearFn(torch.autograd.Function):
    """
    Autograd function for a fused Layer/RMS Normalization followed immediately by a Linear projection.

    The normalized activations are not saved for backward; they are recomputed
    there to save memory.
    """

    @staticmethod
    @custom_fwd(device_type="cuda")
    def forward(
        ctx,
        x,
        norm_weight,
        norm_bias,
        linear_weight,
        linear_bias,
        residual=None,
        eps=1e-6,
        prenorm=False,
        residual_in_fp32=False,
        is_rms_norm=False,
    ):
        """
        Forward pass for the LayerNormLinearFn.

        Args:
            ctx (Any): Autograd context.
            x (torch.Tensor): Input tensor; flattened to 2D over its last dimension.
            norm_weight (torch.Tensor): Normalization weights.
            norm_bias (torch.Tensor | None): Normalization biases.
            linear_weight (torch.Tensor): Linear projection weights.
            linear_bias (torch.Tensor | None): Linear projection biases.
            residual (torch.Tensor, optional): Optional residual tensor; must have the same shape as ``x``. Defaults to None.
            eps (float, optional): Numerical stability epsilon. Defaults to 1e-6.
            prenorm (bool, optional): Whether to return the pre-normalized (residual) state. Defaults to False.
            residual_in_fp32 (bool, optional): Whether to maintain the residual in FP32. Defaults to False.
            is_rms_norm (bool, optional): If True, uses RMS norm instead of Layer norm. Defaults to False.

        Returns:
            torch.Tensor | tuple[torch.Tensor, torch.Tensor]: The projected output, optionally along with the prenorm residual state.
        """
        x_shape_og = x.shape
        # reshape input data into 2D tensor
        x = x.reshape(-1, x.shape[-1])
        if x.stride(-1) != 1:
            x = x.contiguous()
        if residual is not None:
            assert residual.shape == x_shape_og
            residual = residual.reshape(-1, residual.shape[-1])
            if residual.stride(-1) != 1:
                residual = residual.contiguous()
        norm_weight = norm_weight.contiguous()
        if norm_bias is not None:
            norm_bias = norm_bias.contiguous()
        residual_dtype = (
            residual.dtype
            if residual is not None
            else (torch.float32 if residual_in_fp32 else None)
        )
        # Resolve the autocast dtype once. torch.get_autocast_gpu_dtype() is
        # deprecated in recent PyTorch releases in favour of the device-generic
        # torch.get_autocast_dtype("cuda"); keep the legacy call as a fallback
        # for older releases that do not provide the new getter.
        if torch.is_autocast_enabled():
            if hasattr(torch, "get_autocast_dtype"):
                autocast_dtype = torch.get_autocast_dtype("cuda")
            else:
                autocast_dtype = torch.get_autocast_gpu_dtype()
        else:
            autocast_dtype = None
        y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd(
            x,
            norm_weight,
            norm_bias,
            eps,
            residual,
            out_dtype=autocast_dtype,
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
        )
        y = y.reshape(x_shape_og)
        # Under autocast the projection runs in the autocast dtype; otherwise
        # it follows the dtype of the normalized output.
        dtype = autocast_dtype if autocast_dtype is not None else y.dtype
        linear_weight = linear_weight.to(dtype)
        linear_bias = (
            linear_bias.to(dtype) if linear_bias is not None else None
        )
        out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
        # We don't store y, will be recomputed in the backward pass to save memory
        ctx.save_for_backward(
            residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
        )
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.is_rms_norm = is_rms_norm
        ctx.has_residual = residual is not None
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype
        ctx.linear_bias_is_none = linear_bias is None
        return out if not prenorm else (out, residual_out.reshape(x_shape_og))

    @staticmethod
    @custom_bwd(device_type="cuda")
    def backward(ctx, dout, *args):
        """
        Backward pass for LayerNormLinearFn.

        Args:
            ctx (Any): Autograd context populated by :meth:`forward`.
            dout (torch.Tensor): Gradient of the projected output.
            *args: Additional output gradients; ``args[0]`` is the gradient of the
                prenorm residual state when ``prenorm`` was set in forward.

        Returns:
            tuple: One entry per forward input (10 in total): gradients for ``x``,
            ``norm_weight``, ``norm_bias``, ``linear_weight``, ``linear_bias`` and
            ``residual``, followed by ``None`` for ``eps``, ``prenorm``,
            ``residual_in_fp32`` and ``is_rms_norm``.
        """
        x, norm_weight, norm_bias, linear_weight, mean, rstd = (
            ctx.saved_tensors
        )
        dout = dout.reshape(-1, dout.shape[-1])
        # Gradient w.r.t. the normalized activations (input of the projection).
        dy = F.linear(dout, linear_weight.t())
        dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
        if dy.stride(-1) != 1:
            dy = dy.contiguous()
        assert dy.shape == x.shape
        if ctx.prenorm:
            dresidual = args[0]
            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
            if dresidual.stride(-1) != 1:
                dresidual = dresidual.contiguous()
            assert dresidual.shape == x.shape
        else:
            dresidual = None
        # recompute_output=True makes the backward helper also hand back the
        # normalized output y, which was deliberately not saved in forward.
        dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = (
            _layer_norm_bwd(
                dy,
                x,
                norm_weight,
                norm_bias,
                ctx.eps,
                mean,
                rstd,
                dresidual=dresidual,
                has_residual=ctx.has_residual,
                is_rms_norm=ctx.is_rms_norm,
                x_dtype=ctx.x_dtype,
                recompute_output=True,
            )
        )
        dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
        return (
            dx.reshape(ctx.x_shape_og),
            dnorm_weight,
            dnorm_bias,
            dlinear_weight,
            dlinear_bias,
            dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
            None,
            None,
            None,
            None,
        )




[docs]
def layer_norm_linear_fn(
    x,
    norm_weight,
    norm_bias,
    linear_weight,
    linear_bias,
    residual=None,
    eps=1e-6,
    prenorm=False,
    residual_in_fp32=False,
    is_rms_norm=False,
):
    """
    Functional entry point for the fused Layer/RMS Normalization + Linear projection
    (dispatches to ``LayerNormLinearFn``).

    Args:
        x (torch.Tensor): Input tensor.
        norm_weight (torch.Tensor): Normalization weights.
        norm_bias (torch.Tensor | None): Normalization biases.
        linear_weight (torch.Tensor): Linear projection weights.
        linear_bias (torch.Tensor | None): Linear projection biases.
        residual (torch.Tensor, optional): Residual tensor to add before the norm. Defaults to None.
        eps (float, optional): Epsilon for numerical stability. Defaults to 1e-6.
        prenorm (bool, optional): Also return the pre-normalized (residual) state. Defaults to False.
        residual_in_fp32 (bool, optional): Maintain the residual in FP32. Defaults to False.
        is_rms_norm (bool, optional): If True, computes RMS norm instead of Layer norm. Defaults to False.

    Returns:
        torch.Tensor | tuple[torch.Tensor, torch.Tensor]: The projected output. If ``prenorm=True``, returns ``(out, prenorm_state)``.
    """
    # autograd.Function.apply only takes positional arguments, so the order
    # below has to mirror LayerNormLinearFn.forward exactly.
    forward_args = (
        x,
        norm_weight,
        norm_bias,
        linear_weight,
        linear_bias,
        residual,
        eps,
        prenorm,
        residual_in_fp32,
        is_rms_norm,
    )
    return LayerNormLinearFn.apply(*forward_args)