Upload fused_linear_diffusion_cross_entropy.py with huggingface_hub
fused_linear_diffusion_cross_entropy.py
ADDED
@@ -0,0 +1,682 @@
# -*- coding: utf-8 -*-

# Code adapted from
# https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/fused_linear_cross_entropy.py
# Implementation of element-wise division of cross entropy loss


# Code adapted from
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py

from functools import partial
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import triton
import triton.language as tl
from torch.distributed import DeviceMesh
from torch.distributed.tensor import DTensor, Replicate, Shard, distribute_module
from torch.distributed.tensor.parallel import ParallelStyle

# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576
# https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
# The optimal maximum block size depends on your hardware, your kernel, and your dtype
MAX_FUSED_SIZE = 65536 // 2


@triton.heuristics({
    'HAS_SCALE': lambda args: args['scale'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in [1, 2, 4, 8, 16, 32]
    ],
    key=['D']
)
@triton.jit
def logsumexp_fwd_kernel(
    x,
    z,
    scale,
    D: tl.constexpr,
    B: tl.constexpr,
    HAS_SCALE: tl.constexpr
):
    i_n, i_d = tl.program_id(0).to(tl.int64), tl.program_id(1).to(tl.int64)
    o_d = i_d * B + tl.arange(0, B)
    m_d = o_d < D

    b_x = tl.load(x + i_n * D + o_d, mask=m_d, other=-float('inf'))
    if HAS_SCALE:
        b_x = b_x * scale
    b_m = tl.max(b_x, 0)
    b_z = tl.log(tl.sum(tl.exp(b_x - b_m), 0)) + b_m
    tl.store(z + i_n * tl.cdiv(D, B) + i_d, b_z)


def logsumexp_fwd(
    x,
    scale: Optional[float] = None,
    dtype: Optional[torch.dtype] = None
):
    r"""
    Compute the logsumexp of the input tensor over the last dimension.

    Args:
        x (Tensor):
            The input tensor of any shape.
        scale (Optional[float]):
            The scale applied to the input tensor. Default: `None`.
        dtype (Optional[torch.dtype]):
            The data type of the output tensor. Default: `None`.
    Returns:
        Tensor: The logsumexp of the input tensor.
    """

    shape = x.shape
    x = x.view(-1, shape[-1])
    N, D = x.shape
    B = min(triton.next_power_of_2(D), 64 * 1024)
    ND = triton.cdiv(D, B)

    z = x.new_empty(N, ND, dtype=torch.float)
    logsumexp_fwd_kernel[(N, ND)](
        x=x,
        z=z,
        scale=scale,
        D=D,
        B=B
    )
    z = z.logsumexp(-1).view(*shape[:-1])
    if dtype is not None and dtype != torch.float:
        z = z.to(dtype)
    return z
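

# Quick correctness sketch for `logsumexp_fwd` (an editorial addition, not part of the adapted
# sources). It assumes a CUDA device with Triton available; the helper name is illustrative.
def _example_logsumexp_check():
    x = torch.randn(4, 4096, device="cuda")
    # The kernel folds `scale` into the logits before reducing over the last dimension,
    # so the reference is torch.logsumexp(x * scale, dim=-1) computed in fp32.
    z = logsumexp_fwd(x, scale=2.0, dtype=torch.float)
    ref = torch.logsumexp(x.float() * 2.0, dim=-1)
    torch.testing.assert_close(z, ref, rtol=1e-4, atol=1e-4)
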

@triton.jit
def cross_entropy_kernel(
    logits,
    lse,
    target,
    p_mask,
    loss,
    total,
    ignore_index,
    label_smoothing: tl.constexpr,
    logit_scale: tl.constexpr,
    reduction: tl.constexpr,
    V: tl.constexpr,
    BV: tl.constexpr
):
    """
    This kernel computes both cross entropy loss and the gradient of the input.
    We only consider hard label + mean reduction for now.
    Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math.

    Args:
        logits:
            Pointer to logits tensor.
        lse:
            Pointer to logsumexp tensor.
        target:
            Pointer to target tensor.
        p_mask:
            Pointer to the per-token masking probability tensor used for the diffusion '1/t' scaling.
        loss:
            Pointer to tensor to store the loss.
        V (int):
            The number of columns in the input tensor.
        total (int):
            The number of non-ignored elements in the batch.
        ignore_index (int):
            The index to ignore in the target.
        label_smoothing (float):
            The amount of smoothing when computing the loss, where 0.0 means no smoothing.
        reduction (str):
            The string for the reduction to apply.
        BV (int):
            The block size for vocab.
    """

    # https://github.com/triton-lang/triton/issues/1058
    # If B*T*V is too large, i_n * stride will overflow out of int32, so we convert to int64
    i_n = tl.program_id(0).to(tl.int64)
    NV = tl.cdiv(V, BV)

    # 1. Load target first because if the target is ignore_index, we can return right away
    b_y = tl.load(target + i_n)
    # load p_mask
    b_p_mask = tl.load(p_mask + i_n)

    # 2. locate the start index
    logits += i_n * V

    if b_y == ignore_index:
        # set all x as 0
        for i in range(0, V, BV):
            o_v = i + tl.arange(0, BV)
            tl.store(logits + o_v, 0.0, mask=o_v < V)
        return

    # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
    # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867

    # 3. [Online softmax] first pass: compute logsumexp
    # we did this in another kernel
    b_l = tl.load(logits + b_y) * logit_scale
    b_lse = tl.load(lse + i_n)

    # 4. Calculate the loss
    # loss = lse - logits_l
    # celoss = -log(q_y) = -log(softmax(x_y))
    b_loss = (b_lse - b_l) / b_p_mask  # diffusion scaling by '1/t'
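    # Elaboration (added for clarity): for a token i with target y_i, the plain cross entropy is
    #     CE_i = lse_i - logit_scale * x_i[y_i]
    # and the line above divides it by the per-token masking probability p_mask_i,
    #     loss_i = CE_i / p_mask_i,
    # the fused counterpart of the `unreduced_loss /= p_mask` step from the LLaDA pre-training
    # guideline referenced in `FusedLinearDiffusionCrossEntropyLoss.forward` below.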

    # Label smoothing is a general case of normal cross entropy
    # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310
    b_z = 0.0
    eps = label_smoothing / V

    # We need tl.debug_barrier() as mentioned in
    # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
    tl.debug_barrier()

    # 5. [Online Softmax] Second pass: compute gradients
    # For 'mean' reduction, gradients are normalized by number of non-ignored elements
    # dx_y = (softmax(x_y) - 1) / N
    # dx_i = softmax(x_i) / N, i != y
    # For label smoothing:
    # dx_i = (softmax(x_i) - label_smoothing / V) / N, i != y
    # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N
    #      = dx_i - (1 - label_smoothing) / N
    for iv in range(0, NV):
        o_v = iv * BV + tl.arange(0, BV)
        b_logits = tl.load(logits + o_v, mask=o_v < V, other=float('-inf')) * logit_scale
        if label_smoothing > 0:
            # scale X beforehand to avoid overflow
            b_z += tl.sum(tl.where(o_v < V, -eps * b_logits, 0.0))
        b_p = (tl.exp(b_logits - b_lse) - eps) * logit_scale
        b_p /= b_p_mask  # modified: apply the same diffusion '1/t' scaling to the gradient
        if reduction == "mean":
            b_p = b_p / total
        tl.store(logits + o_v, b_p, mask=o_v < V)

    tl.debug_barrier()

    # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
    # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
    #          = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
    # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
    #          = (1 - label_smoothing) * H(q, p) + (-sum(x_i * eps) + label_smoothing * (m + logd))
    # Refer to H(q', p) in section 7 of the paper:
    # https://arxiv.org/pdf/1512.00567
    # pytorch:
    # https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
    # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
    if label_smoothing > 0:
        b_loss = b_loss * (1 - label_smoothing) + (b_z + label_smoothing * b_lse)

    # 6. Specially handle the i == y case, where `dx_y = (softmax(x_y) - (1 - label_smoothing)) / N`
    b_l = tl.load(logits + b_y)

    # Normalize the loss by the number of non-ignored elements if reduction is "mean"
    if reduction == 'mean':
        b_loss = b_loss / total
        # b_l += (label_smoothing - 1) / total * logit_scale
        # b_l has already been divided by b_p_mask and total, so scale the correction term the same way
        b_l += (label_smoothing - 1) / b_p_mask / total * logit_scale
    else:
        # b_l += (label_smoothing - 1) * logit_scale
        b_l += (label_smoothing - 1) / b_p_mask * logit_scale

    tl.store(loss + i_n, b_loss)
    tl.store(logits + b_y, b_l)


@triton.jit
def elementwise_mul_kernel(
    x,
    g,
    N: tl.constexpr,
    B: tl.constexpr
):
    """
    This function multiplies each element of the tensor pointed by x with the value pointed by g.
    The multiplication is performed in-place on the tensor pointed by x.

    Parameters:
    x:
        Pointer to the input tensor.
    g:
        Pointer to the gradient output value.
    N (int):
        The number of columns in the input tensor.
    B (int):
        The block size for Triton operations.
    """

    # Get the program ID and convert it to int64 to avoid overflow
    i_x = tl.program_id(0).to(tl.int64)
    o_x = i_x * B + tl.arange(0, B)

    # Load the gradient output value
    b_g = tl.load(g)
    b_x = tl.load(x + o_x, mask=o_x < N)
    tl.store(x + o_x, b_x * b_g, mask=o_x < N)


def fused_linear_cross_entropy_forward(
    x: torch.Tensor,
    target: torch.LongTensor,
    weight: torch.Tensor,
    bias: torch.Tensor = None,
    p_mask: torch.Tensor = None,
    ignore_index: int = -100,
    label_smoothing: float = 0.0,
    logit_scale: float = 1.0,
    num_chunks: int = 8,
    reduction: str = "mean"
):
    device = x.device
    # inputs have shape: [N, H]
    # materialized activations will have shape: [N, V]
    # the increase in memory = [N, V]
    # reduction can be achieved by partitioning the number of tokens N into smaller chunks.

    # ideally, we would like to achieve the same memory consumption as [N, H],
    # so the expected chunk size should be:
    # NC = ceil(V / H)
    # C = ceil(N / NC)
    # for ex: N = 4096*4, V = 32000, H = 4096 ==> NC = 8, C = ceil(N / NC) = 2048
    N, H, V = *x.shape, weight.shape[0]
    BV = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
    # TODO: in real cases, we may need to limit the number of chunks NC to
    # ensure the precision of accumulated gradients
    NC = min(num_chunks, triton.cdiv(V, H))
    C = triton.next_power_of_2(triton.cdiv(N, NC))
    NC = triton.cdiv(N, C)

    # [N, H]
    dx = torch.zeros_like(x, device=device)
    # [V, H]
    dw = torch.zeros_like(weight, device=device, dtype=torch.float) if weight is not None else None
    # [V]
    db = torch.zeros_like(bias, device=device, dtype=torch.float) if bias is not None else None
    # [N]
    loss = torch.zeros(N, device=device, dtype=torch.float)

    total = target.ne(ignore_index).sum().item()

    for ic in range(NC):
        start, end = ic * C, min((ic + 1) * C, N)
        # [C, H]
        c_x = x[start:end]
        # when doing matmul, use the original precision
        # [C, V]
        c_logits = F.linear(c_x, weight, bias)
        c_target = target[start:end]
        c_p_mask = p_mask[start:end]
        # [C]
        # keep lse in fp32 to maintain precision
        c_lse = logsumexp_fwd(c_logits, scale=logit_scale, dtype=torch.float)

        # unreduced loss
        c_loss = loss[start:end]

        # Here we calculate the gradient of c_logits in place so we can save memory.
        cross_entropy_kernel[(c_logits.shape[0],)](
            logits=c_logits,
            lse=c_lse,
            target=c_target,
            p_mask=c_p_mask,
            loss=c_loss,
            total=total,
            ignore_index=ignore_index,
            label_smoothing=label_smoothing,
            logit_scale=logit_scale,
            reduction=reduction,
            V=V,
            BV=BV,
            num_warps=32
        )

        # gradient of logits is computed in-place by the above triton kernel and is of shape: C x V
        # thus dx should be of shape: C x H
        dx[start:end] = torch.mm(c_logits, weight)

        # keep dw in fp32 to maintain precision
        if weight is not None:
            dw += c_logits.t() @ c_x

        if bias is not None:
            torch.add(input=db, other=c_logits.sum(0), out=db)

    loss = loss.sum()
    if dw is not None:
        dw = dw.to(weight)
    if db is not None:
        db = db.to(bias)
    return loss, dx, dw, db


def fused_linear_cross_entropy_backward(
    do: torch.Tensor,
    dx: torch.Tensor,
    dw: torch.Tensor,
    db: torch.Tensor
):
    # If cross entropy is the last layer, do is 1.0. Skip the mul to save time
    if torch.ne(do, torch.tensor(1.0, device=do.device)):
        # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
        # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
        N, H = dx.shape
        B = min(MAX_FUSED_SIZE, triton.next_power_of_2(H))

        elementwise_mul_kernel[(triton.cdiv(N * H, B),)](
            x=dx,
            g=do,
            N=N*H,
            B=B,
            num_warps=32,
        )

        # handle dw
        if dw is not None:
            V, H = dw.shape
            elementwise_mul_kernel[(triton.cdiv(V * H, B),)](
                x=dw,
                g=do,
                N=V*H,
                B=B,
                num_warps=32,
            )

        if db is not None:
            V = db.shape[0]
            elementwise_mul_kernel[(triton.cdiv(V, B),)](
                x=db,
                g=do,
                N=V,
                B=B,
                num_warps=32,
            )
    return dx, dw, db


class FusedLinearCrossEntropyFunction(torch.autograd.Function):

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        target: torch.LongTensor,
        weight: torch.Tensor,
        bias: torch.Tensor = None,
        p_mask: torch.Tensor = None,
        ignore_index: int = -100,
        label_smoothing: float = 0.0,
        logit_scale: float = 1.0,
        num_chunks: int = 8,
        reduction: str = "mean"
    ):
        """
        Fusing the last linear layer with cross-entropy loss
            Reference: https://github.com/mgmalek/efficient_cross_entropy

        Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding
        the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can
        compute the gradient at the forward pass. By doing so, we don't have to store the x and target
        for the backward pass.

        x (torch.Tensor): [batch_size * seq_len, hidden_size]
        target (torch.LongTensor): [batch_size * seq_len]
            where each value is in [0, vocab_size).
        weight (torch.Tensor): [vocab_size, hidden_size]
            where `vocab_size` is the number of classes.
        bias (Optional[torch.Tensor]): [vocab_size]
            where `vocab_size` is the number of classes.
        p_mask (torch.Tensor): [batch_size * seq_len]
            Its shape should be the same as target.
        ignore_index:
            the index to ignore in the target.
        label_smoothing:
            the amount of smoothing when computing the loss, where 0.0 means no smoothing.
        logit_scale: float
            A scaling factor applied to the logits. Default: 1.0
        num_chunks: int
            The number of chunks to split the input tensor into for processing.
            This can help optimize memory usage and computation speed.
            Default: 8
        reduction:
            Specifies the reduction to apply to the output: 'mean' | 'sum'.
            'mean': the weighted mean of the output is taken,
            'sum': the output will be summed.
            Default: 'mean'.
        """
        loss, dx, dw, db = fused_linear_cross_entropy_forward(
            x,
            target,
            weight,
            bias,
            p_mask,
            ignore_index,
            label_smoothing,
            logit_scale,
            num_chunks,
            reduction
        )
        # downcast to dtype and store for backward
        ctx.save_for_backward(
            dx.detach(),
            dw.detach() if weight is not None else None,
            db.detach() if bias is not None else None,
        )
        return loss

    @staticmethod
    def backward(ctx, do):
        dx, dw, db = ctx.saved_tensors
        dx, dw, db = fused_linear_cross_entropy_backward(do, dx, dw, db)
        # 10 gradients should be returned, with `p_mask` having no grads
        # Check the number of arguments in the `forward` method
        return dx, None, dw, db, None, None, None, None, None, None


def fused_linear_cross_entropy_loss(
    x: torch.Tensor,
    target: torch.LongTensor,
    weight: torch.Tensor,
    bias: torch.Tensor = None,
    p_mask: torch.Tensor = None,
    ignore_index: int = -100,
    label_smoothing: float = 0.0,
    logit_scale: float = 1.0,
    num_chunks: int = 8,
    reduction: str = "mean"
) -> torch.Tensor:
    """
    Args:
        x (torch.Tensor): [batch_size * seq_len, hidden_size]
        target (torch.LongTensor): [batch_size * seq_len]
            where each value is in [0, vocab_size).
        weight (torch.Tensor): [vocab_size, hidden_size]
            where `vocab_size` is the number of classes.
        bias (Optional[torch.Tensor]): [vocab_size]
            where `vocab_size` is the number of classes.
        p_mask (torch.Tensor): [batch_size * seq_len]
            Its shape should be the same as target.
        ignore_index: int.
            If target == ignore_index, the loss is set to 0.0.
        label_smoothing: float
        logit_scale: float
            A scaling factor applied to the logits. Default: 1.0
        num_chunks: int
            The number of chunks to split the input tensor into for processing.
            This can help optimize memory usage and computation speed.
            Default: 8
        reduction:
            Specifies the reduction to apply to the output: 'mean' | 'sum'.
            'mean': the weighted mean of the output is taken,
            'sum': the output will be summed.
            Default: 'mean'.
    Returns:
        loss: a scalar loss tensor.
    """
    return FusedLinearCrossEntropyFunction.apply(
        x,
        target,
        weight,
        bias,
        p_mask,
        ignore_index,
        label_smoothing,
        logit_scale,
        num_chunks,
        reduction
    )
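

# The reference check below is an editorial addition, not part of the adapted sources. It sketches,
# assuming `label_smoothing == 0`, what the fused op computes: the (optionally scaled) per-token
# cross entropy divided by `p_mask`, then reduced. Comparing it against the fused path requires a
# CUDA device with Triton; the helper name `_reference_fused_linear_diffusion_ce` is illustrative.
def _reference_fused_linear_diffusion_ce(
    x: torch.Tensor,
    target: torch.LongTensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor],
    p_mask: torch.Tensor,
    ignore_index: int = -100,
    logit_scale: float = 1.0,
    reduction: str = "mean"
) -> torch.Tensor:
    # Materialize the full [N, V] logits; this is exactly what the fused path avoids.
    logits = F.linear(x, weight, bias).float() * logit_scale
    # Per-token cross entropy; positions equal to `ignore_index` contribute 0.
    ce = F.cross_entropy(logits, target, ignore_index=ignore_index, reduction="none")
    # Diffusion weighting: divide each token's loss by its masking probability.
    ce = ce / p_mask
    if reduction == "mean":
        return ce.sum() / target.ne(ignore_index).sum()
    return ce.sum()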


class FusedLinearDiffusionCrossEntropyLoss(nn.Module):

    def __init__(
        self,
        ignore_index: int = -100,
        label_smoothing: float = 0.0,
        logit_scale: float = 1.0,
        num_chunks: int = 8,
        reduction: str = "mean"
    ):
        """
        Args:
            ignore_index: int.
                If target == ignore_index, the loss is set to 0.0.
            label_smoothing: float
            logit_scale: float
                A scaling factor applied to the logits. Default: 1.0
            num_chunks: int
                The number of chunks to split the input tensor into for processing.
                This can help optimize memory usage and computation speed.
                Default: 8
            reduction:
                Specifies the reduction to apply to the output: 'mean' | 'sum'.
                'mean': the weighted mean of the output is taken,
                'sum': the output will be summed.
                Default: 'mean'.
        """
        super().__init__()

        assert reduction in ["mean", "sum"], f"reduction: {reduction} is not supported"

        self.ignore_index = ignore_index
        self.label_smoothing = label_smoothing
        self.logit_scale = logit_scale
        self.num_chunks = num_chunks
        self.reduction = reduction

    @torch.compiler.disable
    def forward(
        self,
        x: torch.Tensor,
        target: torch.LongTensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
        p_mask: torch.Tensor = None
    ):
        """
        Args:
            x (torch.Tensor): [batch_size, seq_len, hidden_size]
            target (torch.LongTensor): [batch_size, seq_len]
                where each value is in [0, V).
            weight (torch.Tensor): [vocab_size, hidden_size]
                where `vocab_size` is the number of classes.
            bias (Optional[torch.Tensor]): [vocab_size]
                where `vocab_size` is the number of classes.
            p_mask (torch.Tensor): [batch_size, seq_len]
                Its shape is the same as target.
                Shape: (1, packed_length) when varlen attn is used.
        Returns:
            loss

        TODO:
            follow https://github.com/ML-GSAI/LLaDA/blob/main/GUIDELINES.md#pre-training
            ```py
            unreduced_loss /= p_mask
            ```
            Scale the values of `unreduced_loss` at different positions.
        """
        if p_mask is None:
            p_mask = torch.ones_like(target, dtype=torch.float, device=x.device)

        x = x.contiguous().view(-1, x.shape[-1])
        target = target.contiguous().view(-1)
        weight = weight.contiguous()
        bias = bias.contiguous() if bias is not None else None
        p_mask = p_mask.contiguous().view(-1)
        l, d = x.shape
        assert l == target.shape[0] == p_mask.shape[0], f"{x.shape=}, {target.shape=}, {p_mask.shape=}"

        loss = fused_linear_cross_entropy_loss(
            x,
            target,
            weight=weight,
            bias=bias,
            p_mask=p_mask,
            ignore_index=self.ignore_index,
            label_smoothing=self.label_smoothing,
            logit_scale=self.logit_scale,
            num_chunks=self.num_chunks,
            reduction=self.reduction
        )
        return loss
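

# Illustrative usage sketch (an editorial addition): how the module above might be called in a
# diffusion-LM training step. The tensor names, sizes, and the standalone `lm_head_weight` are
# assumptions made for the example; running it requires a CUDA device with Triton.
def _example_diffusion_loss_step():
    device = "cuda"
    B, T, H, V = 2, 16, 64, 1000
    hidden = torch.randn(B, T, H, device=device, requires_grad=True)
    lm_head_weight = torch.randn(V, H, device=device, requires_grad=True)
    targets = torch.randint(0, V, (B, T), device=device)
    # Per-token masking probability, kept away from 0 to avoid blowing up the '1/t' weighting.
    p_mask = torch.rand(B, T, device=device).clamp_(min=0.1)

    criterion = FusedLinearDiffusionCrossEntropyLoss(reduction="mean")
    loss = criterion(hidden, targets, lm_head_weight, bias=None, p_mask=p_mask)
    loss.backward()  # populates hidden.grad and lm_head_weight.grad via the fused backward
    return loss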


class LinearLossParallel(ParallelStyle):
    def __init__(
        self,
        *,
        sequence_dim: int = 1,
        use_local_output: bool = False,
    ):
        super().__init__()

        self.sequence_sharding = (Shard(sequence_dim),)
        self.use_local_output = use_local_output

    @staticmethod
    def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh):
        x, target, weight, bias = inputs

        if not isinstance(x, DTensor):
            # assume the input passed in is already sharded on the sequence dim and create the DTensor
            x = DTensor.from_local(x, device_mesh, sequence_sharding)
        if x.placements != sequence_sharding:
            x = x.redistribute(placements=sequence_sharding, async_op=True)
        if not isinstance(target, DTensor):
            target = DTensor.from_local(target, device_mesh, [Replicate()])
        if target.placements != sequence_sharding:
            target = target.redistribute(placements=sequence_sharding, async_op=True)

        if not isinstance(weight, DTensor):
            weight = DTensor.from_local(weight, device_mesh, [Replicate()])
        if weight.placements != [Replicate()]:
            # we replicate the weight/bias in FLCE
            weight = weight.redistribute(placements=[Replicate()], async_op=True)

        if bias is not None and not isinstance(bias, DTensor):
            bias = DTensor.from_local(bias, device_mesh, [Replicate()])
        if bias is not None and bias.placements != [Replicate()]:
            bias = bias.redistribute(placements=[Replicate()], async_op=True)

        return x.to_local(), target.to_local(), weight.to_local(), bias.to_local() if bias is not None else bias

    @staticmethod
    def _prepare_output_fn(use_local_output, mod, outputs, device_mesh):
        return outputs.to_local() if use_local_output else outputs

    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
        return distribute_module(
            module,
            device_mesh,
            partition_fn=None,
            input_fn=partial(self._prepare_input_fn, self.sequence_sharding),
            output_fn=partial(self._prepare_output_fn, self.use_local_output)
        )
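

# Illustrative sketch (an editorial addition, untested): `LinearLossParallel` is a `ParallelStyle`,
# so one plausible way to apply it is through `parallelize_module`, wrapping a loss module whose
# forward takes (x, target, weight, bias), sharding x along the sequence dim and replicating the
# weight/bias. The mesh setup and module names below are assumptions, not prescribed by this file.
#
#   from torch.distributed.device_mesh import init_device_mesh
#   from torch.distributed.tensor.parallel import parallelize_module
#
#   tp_mesh = init_device_mesh("cuda", (tp_world_size,))
#   parallelize_module(loss_module, tp_mesh, LinearLossParallel(sequence_dim=1))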