BiliSakura committed on Apr 3

Commit

a7134ff

verified ·

1 Parent(s): ef71abe

Add files using upload-large-folder tool

Browse files

Files changed (28) hide show

README.md +1 -0
eupe/__init__.py +1 -0
eupe/__pycache__/__init__.cpython-312.pyc +0 -0
eupe/layers/__init__.py +8 -0
eupe/layers/__pycache__/__init__.cpython-312.pyc +0 -0
eupe/layers/__pycache__/attention.cpython-312.pyc +0 -0
eupe/layers/__pycache__/block.cpython-312.pyc +0 -0
eupe/layers/__pycache__/ffn_layers.cpython-312.pyc +0 -0
eupe/layers/__pycache__/layer_scale.cpython-312.pyc +0 -0
eupe/layers/__pycache__/patch_embed.cpython-312.pyc +0 -0
eupe/layers/__pycache__/rms_norm.cpython-312.pyc +0 -0
eupe/layers/__pycache__/rope_position_encoding.cpython-312.pyc +0 -0
eupe/layers/attention.py +153 -0
eupe/layers/block.py +249 -0
eupe/layers/ffn_layers.py +73 -0
eupe/layers/layer_scale.py +25 -0
eupe/layers/patch_embed.py +73 -0
eupe/layers/rms_norm.py +20 -0
eupe/layers/rope_position_encoding.py +108 -0
eupe/models/__init__.py +2 -0
eupe/models/__pycache__/__init__.cpython-312.pyc +0 -0
eupe/models/__pycache__/vision_transformer.cpython-312.pyc +0 -0
eupe/models/vision_transformer.py +318 -0
eupe/utils/__init__.py +2 -0
eupe/utils/__pycache__/__init__.cpython-312.pyc +0 -0
eupe/utils/__pycache__/utils.cpython-312.pyc +0 -0
eupe/utils/utils.py +51 -0
transformers_eupe.py +1 -0

README.md CHANGED Viewed

@@ -34,6 +34,7 @@ This repository contains a converted EUPE checkpoint (from the original Facebook
 - `config.json`: architecture/config parameters
 - `preprocessor_config.json`: image preprocessing setup
 - `transformers_eupe.py`: local EUPE Transformers registration wrapper
 ## Preprocessing

 - `config.json`: architecture/config parameters
 - `preprocessor_config.json`: image preprocessing setup
 - `transformers_eupe.py`: local EUPE Transformers registration wrapper
+- `eupe/`: vendored EUPE model implementation used by `transformers_eupe.py`
 ## Preprocessing

eupe/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Local EUPE package vendored for standalone model loading.

eupe/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (172 Bytes). View file

eupe/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from .attention import CausalSelfAttention, LinearKMaskedBias, SelfAttention
+from .block import CausalSelfAttentionBlock, SelfAttentionBlock
+from .ffn_layers import Mlp, SwiGLUFFN
+from .layer_scale import LayerScale
+from .patch_embed import PatchEmbed
+from .rms_norm import RMSNorm
+from .rope_position_encoding import RopePositionEmbedding

eupe/layers/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (637 Bytes). View file

eupe/layers/__pycache__/attention.cpython-312.pyc ADDED Viewed

Binary file (10.7 kB). View file

eupe/layers/__pycache__/block.cpython-312.pyc ADDED Viewed

Binary file (12.3 kB). View file

eupe/layers/__pycache__/ffn_layers.cpython-312.pyc ADDED Viewed

Binary file (4.36 kB). View file

eupe/layers/__pycache__/layer_scale.cpython-312.pyc ADDED Viewed

Binary file (1.73 kB). View file

eupe/layers/__pycache__/patch_embed.cpython-312.pyc ADDED Viewed

Binary file (4.22 kB). View file

eupe/layers/__pycache__/rms_norm.cpython-312.pyc ADDED Viewed

Binary file (1.92 kB). View file

eupe/layers/__pycache__/rope_position_encoding.cpython-312.pyc ADDED Viewed

Binary file (6.14 kB). View file

eupe/layers/attention.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import math
+from typing import List, Tuple
+import torch
+import torch.nn.functional as F
+from eupe.utils import cat_keep_shapes, uncat_with_shapes
+from torch import Tensor, nn
def rope_rotate_half(x: Tensor) -> Tensor:
    """Swap the two halves of the last dimension, negating the second one.

    With ``x`` split along ``dim=-1`` into ``[a, b]``, the result is
    ``[-b, a]``.
    """
    first_half, second_half = torch.chunk(x, 2, dim=-1)
    return torch.cat((second_half.neg(), first_half), dim=-1)
def rope_apply(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """Apply a rotary embedding to ``x``.

    Returns ``x * cos + rope_rotate_half(x) * sin``; ``sin`` and ``cos`` must
    broadcast against ``x``.
    """
    cos_term = x * cos
    sin_term = rope_rotate_half(x) * sin
    return cos_term + sin_term
class LinearKMaskedBias(nn.Linear):
    """``nn.Linear`` whose bias is multiplied element-wise by a ``bias_mask`` buffer.

    ``out_features`` must be divisible by 3. When the layer has a bias, a
    ``bias_mask`` buffer with the bias' shape is registered and filled with
    NaN, so the output is NaN until the mask is overwritten with real values
    (presumably by a weight initialiser or a loaded state dict -- confirm
    against the caller).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.out_features % 3 == 0
        if self.bias is None:
            return
        nan_mask = torch.full_like(self.bias, fill_value=math.nan)
        self.register_buffer("bias_mask", nan_mask)

    def forward(self, input: Tensor) -> Tensor:
        """Run the linear map using ``bias * bias_mask`` as the effective bias."""
        if self.bias is None:
            return F.linear(input, self.weight, None)
        mask = self.bias_mask.to(self.bias.dtype)
        return F.linear(input, self.weight, self.bias * mask)
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        mask_k_bias: bool = False,
+        device=None,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        linear_class = LinearKMaskedBias if mask_k_bias else nn.Linear
+        self.qkv = linear_class(dim, dim * 3, bias=qkv_bias, device=device)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias, device=device)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def apply_rope(self, q: Tensor, k: Tensor, rope: Tensor | Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
+        q_dtype = q.dtype
+        k_dtype = k.dtype
+        sin, cos = rope
+        rope_dtype = sin.dtype
+        q = q.to(dtype=rope_dtype)
+        k = k.to(dtype=rope_dtype)
+        N = q.shape[-2]
+        prefix = N - sin.shape[-2]
+        assert prefix >= 0
+        q_prefix = q[:, :, :prefix, :]
+        q = rope_apply(q[:, :, prefix:, :], sin, cos)
+        q = torch.cat((q_prefix, q), dim=-2)
+        k_prefix = k[:, :, :prefix, :]
+        k = rope_apply(k[:, :, prefix:, :], sin, cos)
+        k = torch.cat((k_prefix, k), dim=-2)
+        q = q.to(dtype=q_dtype)
+        k = k.to(dtype=k_dtype)
+        return q, k
+    def forward(self, x: Tensor, attn_bias=None, rope: Tensor = None) -> Tensor:
+        qkv = self.qkv(x)
+        attn_v = self.compute_attention(qkv=qkv, attn_bias=attn_bias, rope=rope)
+        x = self.proj(attn_v)
+        x = self.proj_drop(x)
+        return x
+    def forward_list(self, x_list, attn_bias=None, rope_list=None) -> List[Tensor]:
+        assert len(x_list) == len(rope_list)
+        x_flat, shapes, num_tokens = cat_keep_shapes(x_list)
+        qkv_flat = self.qkv(x_flat)
+        qkv_list = uncat_with_shapes(qkv_flat, shapes, num_tokens)
+        att_out = []
+        for _, (qkv, _, rope) in enumerate(zip(qkv_list, shapes, rope_list)):
+            att_out.append(self.compute_attention(qkv, attn_bias=attn_bias, rope=rope))
+        x_flat, shapes, num_tokens = cat_keep_shapes(att_out)
+        x_flat = self.proj(x_flat)
+        return uncat_with_shapes(x_flat, shapes, num_tokens)
+    def compute_attention(self, qkv: Tensor, attn_bias=None, rope=None) -> Tensor:
+        assert attn_bias is None
+        B, N, _ = qkv.shape
+        C = self.qkv.in_features
+        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = torch.unbind(qkv, 2)
+        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
+        if rope is not None:
+            q, k = self.apply_rope(q, k, rope)
+        x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        x = x.transpose(1, 2)
+        return x.reshape([B, N, C])
+class CausalSelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = attn_drop
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def init_weights(
+        self, init_attn_std: float | None = None, init_proj_std: float | None = None, factor: float = 1.0
+    ) -> None:
+        init_attn_std = init_attn_std or (self.dim**-0.5)
+        init_proj_std = init_proj_std or init_attn_std * factor
+        nn.init.normal_(self.qkv.weight, std=init_attn_std)
+        nn.init.normal_(self.proj.weight, std=init_proj_std)
+        if self.qkv.bias is not None:
+            nn.init.zeros_(self.qkv.bias)
+        if self.proj.bias is not None:
+            nn.init.zeros_(self.proj.bias)
+    def forward(self, x: Tensor, is_causal: bool = True) -> Tensor:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = torch.unbind(qkv, 2)
+        q, k, v = [t.transpose(1, 2) for t in [q, k, v]]
+        x = torch.nn.functional.scaled_dot_product_attention(
+            q, k, v, attn_mask=None, dropout_p=self.attn_drop if self.training else 0, is_causal=is_causal
+        )
+        x = x.transpose(1, 2).contiguous().view(B, N, C)
+        x = self.proj_drop(self.proj(x))
+        return x

eupe/layers/block.py ADDED Viewed

	@@ -0,0 +1,249 @@

+from typing import Callable, List, Optional
+import torch
+from torch import Tensor, nn
+from eupe.utils import cat_keep_shapes, uncat_with_shapes
+from .attention import CausalSelfAttention, SelfAttention
+from .ffn_layers import Mlp
+from .layer_scale import LayerScale
+torch._dynamo.config.automatic_dynamic_shapes = False
+torch._dynamo.config.accumulated_cache_size_limit = 1024
class SelfAttentionBlock(nn.Module):
    """Pre-norm transformer block with batch-level stochastic depth.

    Two residual branches run in sequence: ``norm1 -> attn -> ls1`` and
    ``norm2 -> mlp -> ls2``. In training mode with ``drop_path > 0`` each
    branch is evaluated on a random subset of the batch only, and its output
    is added back to those samples scaled by ``batch / subset_size``.

    Args:
        dim: Feature size.
        num_heads: Number of attention heads.
        ffn_ratio: Hidden size of the FFN as a multiple of ``dim``.
        qkv_bias: Bias flag forwarded to the attention module.
        proj_bias: Bias flag forwarded to the attention module.
        ffn_bias: Bias flag forwarded to the FFN.
        drop: Dropout probability forwarded to attention (``proj_drop``) and FFN.
        attn_drop: Attention dropout probability forwarded to attention.
        init_values: LayerScale initial value; a falsy value replaces both
            LayerScale modules with ``nn.Identity``.
        drop_path: Fraction of the batch skipped per branch during training.
        act_layer: Activation factory forwarded to the FFN.
        norm_layer: Factory called as ``norm_layer(dim)`` for both norms.
        attn_class: Attention module factory.
        ffn_layer: FFN module factory.
        mask_k_bias: Forwarded to the attention module.
        device: Forwarded to attention, FFN and LayerScale.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = SelfAttention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        mask_k_bias: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            mask_k_bias=mask_k_bias,
            device=device,
        )
        self.ls1 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * ffn_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
            device=device,
        )
        self.ls2 = LayerScale(dim, init_values=init_values, device=device) if init_values else nn.Identity()
        self.sample_drop_ratio = drop_path

    @staticmethod
    def _maybe_index_rope(rope: tuple[Tensor, Tensor] | None, indices: Tensor) -> tuple[Tensor, Tensor] | None:
        """Select the rows of a per-sample rope that match ``indices``.

        A 4-D ``(sin, cos)`` pair is indexed along its first dimension;
        any other rank is returned unchanged, and ``None`` stays ``None``.
        """
        if rope is None:
            return None
        sin, cos = rope
        assert sin.ndim == cos.ndim
        if sin.ndim == 4:
            return sin[indices], cos[indices]
        return sin, cos

    def _forward(self, x: Tensor, rope=None) -> Tensor:
        """Single-tensor variant of the block (not used by :meth:`forward`)."""
        b, _, _ = x.shape
        sample_subset_size = max(int(b * (1 - self.sample_drop_ratio)), 1)
        residual_scale_factor = b / sample_subset_size

        if not (self.training and self.sample_drop_ratio > 0.0):
            x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
            return x_attn + self.ls2(self.mlp(self.norm2(x_attn)))

        # Attention branch on a random subset of the batch.
        indices_1 = (torch.randperm(b, device=x.device))[:sample_subset_size]
        rope_subset = self._maybe_index_rope(rope, indices_1)
        residual_1 = self.attn(self.norm1(x[indices_1]), rope=rope_subset)
        x_attn = torch.index_add(
            x,
            dim=0,
            source=self.ls1(residual_1),
            index=indices_1,
            alpha=residual_scale_factor,
        )

        # FFN branch on an independently drawn subset.
        indices_2 = (torch.randperm(b, device=x.device))[:sample_subset_size]
        residual_2 = self.mlp(self.norm2(x_attn[indices_2]))
        return torch.index_add(
            x_attn,
            dim=0,
            source=self.ls2(residual_2),
            index=indices_2,
            alpha=residual_scale_factor,
        )

    def _forward_list(self, x_list: List[Tensor], rope_list=None) -> List[Tensor]:
        """Run the block on every tensor of ``x_list``.

        Args:
            x_list: Input tensors; the first dimension of each is its batch.
            rope_list: One ``(sin, cos)`` pair or ``None`` per input.
                ``None`` (the default) means no rotary embedding at all.
        """
        if rope_list is None:
            # The default used to crash in both branches below.
            rope_list = [None] * len(x_list)

        if not (self.training and self.sample_drop_ratio > 0.0):
            x_out = []
            for x, rope in zip(x_list, rope_list):
                x_attn = x + self.ls1(self.attn(self.norm1(x), rope=rope))
                x_out.append(x_attn + self.ls2(self.mlp(self.norm2(x_attn))))
            return x_out

        b_list = [x.shape[0] for x in x_list]
        sample_subset_sizes = [max(int(b * (1 - self.sample_drop_ratio)), 1) for b in b_list]
        residual_scale_factors = [b / sample_subset_size for b, sample_subset_size in zip(b_list, sample_subset_sizes)]

        # Attention branch: the norm and the attention projections run once
        # over all subsets combined via cat_keep_shapes.
        indices_1_list = [
            (torch.randperm(b, device=x.device))[:sample_subset_size]
            for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes)
        ]
        x_subset_1_list = [x[indices_1] for x, indices_1 in zip(x_list, indices_1_list)]
        rope_subset_list = [
            self._maybe_index_rope(rope, indices_1) for rope, indices_1 in zip(rope_list, indices_1_list)
        ]
        flattened, shapes, num_tokens = cat_keep_shapes(x_subset_1_list)
        norm1 = uncat_with_shapes(self.norm1(flattened), shapes, num_tokens)
        residual_1_list = self.attn.forward_list(norm1, rope_list=rope_subset_list)
        x_attn_list = [
            torch.index_add(
                x,
                dim=0,
                source=self.ls1(residual_1),
                index=indices_1,
                alpha=residual_scale_factor,
            )
            for x, residual_1, indices_1, residual_scale_factor in zip(
                x_list, residual_1_list, indices_1_list, residual_scale_factors
            )
        ]

        # FFN branch on independently drawn subsets.
        indices_2_list = [
            (torch.randperm(b, device=x.device))[:sample_subset_size]
            for x, b, sample_subset_size in zip(x_list, b_list, sample_subset_sizes)
        ]
        x_subset_2_list = [x[indices_2] for x, indices_2 in zip(x_attn_list, indices_2_list)]
        flattened, shapes, num_tokens = cat_keep_shapes(x_subset_2_list)
        norm2_list = uncat_with_shapes(self.norm2(flattened), shapes, num_tokens)
        residual_2_list = self.mlp.forward_list(norm2_list)
        return [
            torch.index_add(
                x_attn,
                dim=0,
                source=self.ls2(residual_2),
                index=indices_2,
                alpha=residual_scale_factor,
            )
            for x_attn, residual_2, indices_2, residual_scale_factor in zip(
                x_attn_list, residual_2_list, indices_2_list, residual_scale_factors
            )
        ]

    def forward(self, x_or_x_list, rope_or_rope_list=None) -> Tensor | List[Tensor]:
        """Apply the block to one tensor or to a list of tensors.

        A tensor input returns a tensor; a list input returns a list of the
        same length.

        Raises:
            AssertionError: If the input is neither a ``Tensor`` nor a ``list``.
        """
        if isinstance(x_or_x_list, Tensor):
            return self._forward_list([x_or_x_list], rope_list=[rope_or_rope_list])[0]
        if isinstance(x_or_x_list, list):
            return self._forward_list(x_or_x_list, rope_list=rope_or_rope_list)
        raise AssertionError(f"expected a Tensor or a list of Tensors, got {type(x_or_x_list).__name__}")
class CausalSelfAttentionBlock(nn.Module):
    """Pre-norm transformer block built on ``CausalSelfAttention`` and ``Mlp``.

    Args:
        dim: Feature size.
        num_heads: Number of attention heads.
        ffn_ratio: Hidden size of the feed-forward layer as a multiple of ``dim``.
        ls_init_value: LayerScale initial value; a falsy value replaces both
            LayerScale modules with ``nn.Identity``.
        is_causal: Forwarded to the attention module on every call.
        act_layer: Activation factory for the feed-forward layer.
        norm_layer: Factory called as ``norm_layer(dim)`` for both norms.
        dropout_prob: Dropout probability used for attention, the attention
            output projection and the feed-forward layer.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        ffn_ratio: float = 4.0,
        ls_init_value: Optional[float] = None,
        is_causal: bool = True,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        dropout_prob: float = 0.0,
    ):
        super().__init__()

        def make_layer_scale() -> nn.Module:
            if ls_init_value:
                return LayerScale(dim, init_values=ls_init_value)
            return nn.Identity()

        self.dim = dim
        self.is_causal = is_causal
        self.ls1 = make_layer_scale()
        self.attention_norm = norm_layer(dim)
        self.attention = CausalSelfAttention(dim, num_heads, attn_drop=dropout_prob, proj_drop=dropout_prob)
        self.ffn_norm = norm_layer(dim)
        self.feed_forward = Mlp(
            in_features=dim,
            hidden_features=int(dim * ffn_ratio),
            drop=dropout_prob,
            act_layer=act_layer,
        )
        self.ls2 = make_layer_scale()

    def init_weights(
        self,
        init_attn_std: float | None = None,
        init_proj_std: float | None = None,
        init_fc_std: float | None = None,
        factor: float = 1.0,
    ) -> None:
        """Re-initialise attention, feed-forward weights and both norms.

        Falsy arguments fall back to ``dim ** -0.5`` (attention),
        ``init_attn_std * factor`` (projections) and ``(2 * dim) ** -0.5``
        (first feed-forward layer).
        """
        if not init_attn_std:
            init_attn_std = self.dim**-0.5
        if not init_proj_std:
            init_proj_std = init_attn_std * factor
        if not init_fc_std:
            init_fc_std = (2 * self.dim) ** -0.5

        self.attention.init_weights(init_attn_std, init_proj_std)
        self.attention_norm.reset_parameters()
        ffn = self.feed_forward
        nn.init.normal_(ffn.fc1.weight, std=init_fc_std)
        nn.init.normal_(ffn.fc2.weight, std=init_proj_std)
        self.ffn_norm.reset_parameters()

    def forward(
        self,
        x: torch.Tensor,
    ):
        """Apply the attention branch, then the feed-forward branch, each as a residual."""
        attn_out = self.attention(self.attention_norm(x), self.is_causal)
        x_attn = x + self.ls1(attn_out)
        ffn_out = self.feed_forward(self.ffn_norm(x_attn))
        return x_attn + self.ls2(ffn_out)

eupe/layers/ffn_layers.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from typing import Callable, List, Optional
+import torch.nn.functional as F
+from torch import Tensor, nn
+from eupe.utils import cat_keep_shapes, uncat_with_shapes
class ListForwardMixin:
    """Mixin that adds ``forward_list`` on top of a subclass' ``forward``."""

    def forward(self, x: Tensor):
        """Must be provided by the class using the mixin."""
        raise NotImplementedError

    def forward_list(self, x_list: List[Tensor]) -> List[Tensor]:
        """Apply ``forward`` once to all tensors of ``x_list`` combined.

        The inputs are merged with ``cat_keep_shapes``, passed through
        ``forward`` in a single call and split back with
        ``uncat_with_shapes``.
        """
        merged, shapes, num_tokens = cat_keep_shapes(x_list)
        return uncat_with_shapes(self.forward(merged), shapes, num_tokens)
class Mlp(nn.Module, ListForwardMixin):
    """Two-layer feed-forward network: linear, activation, dropout, linear, dropout.

    Args:
        in_features: Input feature size.
        hidden_features: Hidden size; a falsy value falls back to ``in_features``.
        out_features: Output size; a falsy value falls back to ``in_features``.
        act_layer: Zero-argument factory for the activation module.
        drop: Probability of the dropout module, which is applied twice.
        bias: Whether both linear layers have a bias.
        device: Device on which the linear layers are created.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
        device=None,
    ) -> None:
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, device=device)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, device=device)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        """Map ``x`` through ``fc1 -> act -> drop -> fc2 -> drop``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class SwiGLUFFN(nn.Module, ListForwardMixin):
    """Gated feed-forward network computing ``w3(silu(w1(x)) * w2(x))``.

    The actual hidden width is ``int(hidden_features * 2 / 3)`` rounded up to
    the next multiple of ``align_to``.

    Args:
        in_features: Input feature size.
        hidden_features: Nominal hidden size; a falsy value falls back to
            ``in_features``.
        out_features: Output size; a falsy value falls back to ``in_features``.
        act_layer: Accepted for signature compatibility; not used.
        drop: Accepted for signature compatibility; not used.
        bias: Whether the three linear layers have a bias.
        align_to: Multiple to which the hidden width is rounded up.
        device: Device on which the linear layers are created.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Optional[Callable[..., nn.Module]] = None,
        drop: float = 0.0,
        bias: bool = True,
        align_to: int = 8,
        device=None,
    ) -> None:
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        two_thirds = int(hidden_features * 2 / 3)
        # Ceil-divide, then scale back: smallest multiple of align_to >= two_thirds.
        gated_width = -(-two_thirds // align_to) * align_to
        self.w1 = nn.Linear(in_features, gated_width, bias=bias, device=device)
        self.w2 = nn.Linear(in_features, gated_width, bias=bias, device=device)
        self.w3 = nn.Linear(gated_width, out_features, bias=bias, device=device)

    def forward(self, x: Tensor) -> Tensor:
        """Gate ``w2(x)`` with ``silu(w1(x))`` and project with ``w3``."""
        gate = F.silu(self.w1(x))
        value = self.w2(x)
        return self.w3(gate * value)

eupe/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from typing import Union
+import torch
+from torch import Tensor, nn
class LayerScale(nn.Module):
    """Multiply the input by a learnable per-channel vector ``gamma``.

    ``gamma`` is allocated with ``torch.empty`` and is therefore
    uninitialised until :meth:`reset_parameters` is called.

    Args:
        dim: Length of ``gamma``.
        init_values: Value written to ``gamma`` by :meth:`reset_parameters`.
        inplace: Scale the input tensor in place instead of allocating a new one.
        device: Device on which ``gamma`` is created.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
        device=None,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(torch.empty(dim, device=device))
        self.init_values = init_values

    def reset_parameters(self):
        """Fill ``gamma`` with ``init_values``."""
        nn.init.constant_(self.gamma, self.init_values)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` scaled by ``gamma`` (the same tensor object when ``inplace``)."""
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma

eupe/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import math
+from typing import Callable, Tuple, Union
+from torch import Tensor, nn
def make_2tuple(x):
    """Return ``x`` as a pair: a 2-tuple is passed through, an int is duplicated."""
    if not isinstance(x, tuple):
        assert isinstance(x, int)
        return (x, x)
    assert len(x) == 2
    return x
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Callable | None = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (
+            image_HW[0] // patch_HW[0],
+            image_HW[1] // patch_HW[1],
+        )
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+    def flops(self) -> float:
+        Ho, Wo = self.patches_resolution
+        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+        if self.norm is not None:
+            flops += Ho * Wo * self.embed_dim
+        return flops
+    def reset_parameters(self):
+        k = 1 / (self.in_chans * (self.patch_size[0] ** 2))
+        nn.init.uniform_(self.proj.weight, -math.sqrt(k), math.sqrt(k))
+        if self.proj.bias is not None:
+            nn.init.uniform_(self.proj.bias, -math.sqrt(k), math.sqrt(k))

eupe/layers/rms_norm.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch
+from torch import Tensor, nn
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable scale.

    Args:
        dim: Length of the ``weight`` vector.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def reset_parameters(self) -> None:
        """Reset ``weight`` to all ones."""
        nn.init.constant_(self.weight, 1)

    def _norm(self, x: Tensor) -> Tensor:
        """Divide ``x`` by the root mean square of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x: Tensor) -> Tensor:
        """Normalise in float32, cast back to the dtype of ``x``, then apply ``weight``."""
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight

eupe/layers/rope_position_encoding.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import math
+from typing import Literal
+import torch
+from torch import Tensor, nn
+class RopePositionEmbedding(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        *,
+        num_heads: int,
+        base: float | None = 100.0,
+        min_period: float | None = None,
+        max_period: float | None = None,
+        normalize_coords: Literal["min", "max", "separate"] = "separate",
+        shift_coords: float | None = None,
+        jitter_coords: float | None = None,
+        rescale_coords: float | None = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+    ):
+        super().__init__()
+        assert embed_dim % (4 * num_heads) == 0
+        both_periods = min_period is not None and max_period is not None
+        if (base is None and not both_periods) or (base is not None and both_periods):
+            raise ValueError("Either `base` or `min_period`+`max_period` must be provided.")
+        D_head = embed_dim // num_heads
+        self.base = base
+        self.min_period = min_period
+        self.max_period = max_period
+        self.D_head = D_head
+        self.normalize_coords = normalize_coords
+        self.shift_coords = shift_coords
+        self.jitter_coords = jitter_coords
+        self.rescale_coords = rescale_coords
+        self.dtype = dtype
+        self.register_buffer(
+            "periods",
+            torch.empty(D_head // 4, device=device, dtype=dtype),
+            persistent=True,
+        )
+        self._init_weights()
+    def forward(self, *, H: int, W: int) -> tuple[Tensor, Tensor]:
+        device = self.periods.device
+        dtype = self.dtype
+        dd = {"device": device, "dtype": dtype}
+        if self.normalize_coords == "max":
+            max_HW = max(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / max_HW
+            coords_w = torch.arange(0.5, W, **dd) / max_HW
+        elif self.normalize_coords == "min":
+            min_HW = min(H, W)
+            coords_h = torch.arange(0.5, H, **dd) / min_HW
+            coords_w = torch.arange(0.5, W, **dd) / min_HW
+        elif self.normalize_coords == "separate":
+            coords_h = torch.arange(0.5, H, **dd) / H
+            coords_w = torch.arange(0.5, W, **dd) / W
+        else:
+            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}")
+        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
+        coords = coords.flatten(0, 1)
+        coords = 2.0 * coords - 1.0
+        if self.training and self.shift_coords is not None:
+            shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords)
+            coords += shift_hw[None, :]
+        if self.training and self.jitter_coords is not None:
+            jitter_max = math.log(self.jitter_coords)
+            jitter_min = -jitter_max
+            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp()
+            coords *= jitter_hw[None, :]
+        if self.training and self.rescale_coords is not None:
+            rescale_max = math.log(self.rescale_coords)
+            rescale_min = -rescale_max
+            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp()
+            coords *= rescale_hw
+        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]
+        angles = angles.flatten(1, 2)
+        angles = angles.tile(2)
+        cos = torch.cos(angles)
+        sin = torch.sin(angles)
+        return (sin, cos)
+    def _init_weights(self):
+        device = self.periods.device
+        dtype = self.dtype
+        if self.base is not None:
+            periods = self.base ** (
+                2 * torch.arange(self.D_head // 4, device=device, dtype=dtype) / (self.D_head // 2)
+            )
+        else:
+            base = self.max_period / self.min_period
+            exponents = torch.linspace(0, 1, self.D_head // 4, device=device, dtype=dtype)
+            periods = base**exponents
+            periods = periods / base
+            periods = periods * self.max_period
+        self.periods.data = periods

eupe/models/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .vision_transformer import DinoVisionTransformer
2	+

eupe/models/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (246 Bytes). View file

eupe/models/__pycache__/vision_transformer.cpython-312.pyc ADDED Viewed

Binary file (16.7 kB). View file

eupe/models/vision_transformer.py ADDED Viewed

	@@ -0,0 +1,318 @@

+import logging
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn.init
+from torch import Tensor, nn
+from eupe.layers import LayerScale, Mlp, PatchEmbed, RMSNorm, RopePositionEmbedding, SelfAttentionBlock, SwiGLUFFN
+from eupe.utils import named_apply
logger = logging.getLogger("eupe")

# Feed-forward implementations selectable by name. The numeric suffix of the
# SwiGLU variants is the ``align_to`` value bound into the constructor.
ffn_layer_dict = {
    "mlp": Mlp,
    "swiglu": SwiGLUFFN,
    **{
        key: partial(SwiGLUFFN, align_to=alignment)
        for key, alignment in (("swiglu32", 32), ("swiglu64", 64), ("swiglu128", 128))
    },
}

# Normalisation layers selectable by name; the two LayerNorm entries differ
# only in ``eps``.
norm_layer_dict = {
    "layernorm": partial(nn.LayerNorm, eps=1e-6),
    "layernormbf16": partial(nn.LayerNorm, eps=1e-5),
    "rmsnorm": RMSNorm,
}

# Short dtype names mapped to torch dtypes.
dtype_dict = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}
def init_weights_vit(module: nn.Module, name: str = ""):
    """Initialise a single module in place, depending on its type.

    * ``nn.Linear``: truncated-normal weights (std 0.02) and zero bias. If the
      layer carries a ``bias_mask`` buffer, the mask is set to 1 everywhere
      except the middle third of the output features, which is set to 0.
    * ``nn.LayerNorm``, ``LayerScale``, ``PatchEmbed``, ``RMSNorm``: the
      module's own ``reset_parameters`` is called.

    ``name`` is accepted but not used (presumably so the function can be
    passed to ``named_apply`` -- confirm against the caller).
    """
    if isinstance(module, nn.Linear):
        torch.nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        bias_mask = getattr(module, "bias_mask", None)
        if bias_mask is not None:
            o = module.out_features
            bias_mask.fill_(1)
            bias_mask[o // 3 : 2 * o // 3].fill_(0)
    if isinstance(module, (nn.LayerNorm, LayerScale, PatchEmbed, RMSNorm)):
        module.reset_parameters()
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        *,
+        img_size: int = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        pos_embed_rope_base: float = 100.0,
+        pos_embed_rope_min_period: float | None = None,
+        pos_embed_rope_max_period: float | None = None,
+        pos_embed_rope_normalize_coords: Literal["min", "max", "separate"] = "separate",
+        pos_embed_rope_shift_coords: float | None = None,
+        pos_embed_rope_jitter_coords: float | None = None,
+        pos_embed_rope_rescale_coords: float | None = None,
+        pos_embed_rope_dtype: str = "bf16",
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        ffn_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop_path_rate: float = 0.0,
+        layerscale_init: float | None = None,
+        norm_layer: str = "layernorm",
+        ffn_layer: str = "mlp",
+        ffn_bias: bool = True,
+        proj_bias: bool = True,
+        n_storage_tokens: int = 0,
+        mask_k_bias: bool = False,
+        untie_cls_and_patch_norms: bool = False,
+        untie_global_and_local_cls_norm: bool = False,
+        device: Any | None = None,
+        **ignored_kwargs,
+    ):
+        super().__init__()
+        if len(ignored_kwargs) > 0:
+            logger.warning(f"Ignored kwargs: {ignored_kwargs}")
+        del ignored_kwargs
+        norm_layer_cls = norm_layer_dict[norm_layer]
+        self.num_features = self.embed_dim = embed_dim
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            flatten_embedding=False,
+        )
+        self.cls_token = nn.Parameter(torch.empty(1, 1, embed_dim, device=device))
+        self.n_storage_tokens = n_storage_tokens
+        if self.n_storage_tokens > 0:
+            self.storage_tokens = nn.Parameter(torch.empty(1, n_storage_tokens, embed_dim, device=device))
+        logger.info(f"using base={pos_embed_rope_base} for rope new")
+        logger.info(f"using min_period={pos_embed_rope_min_period} for rope new")
+        logger.info(f"using max_period={pos_embed_rope_max_period} for rope new")
+        logger.info(f"using normalize_coords={pos_embed_rope_normalize_coords} for rope new")
+        logger.info(f"using shift_coords={pos_embed_rope_shift_coords} for rope new")
+        logger.info(f"using rescale_coords={pos_embed_rope_rescale_coords} for rope new")
+        logger.info(f"using jitter_coords={pos_embed_rope_jitter_coords} for rope new")
+        logger.info(f"using dtype={pos_embed_rope_dtype} for rope new")
+        self.rope_embed = RopePositionEmbedding(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            base=pos_embed_rope_base,
+            min_period=pos_embed_rope_min_period,
+            max_period=pos_embed_rope_max_period,
+            normalize_coords=pos_embed_rope_normalize_coords,
+            shift_coords=pos_embed_rope_shift_coords,
+            jitter_coords=pos_embed_rope_jitter_coords,
+            rescale_coords=pos_embed_rope_rescale_coords,
+            dtype=dtype_dict[pos_embed_rope_dtype],
+            device=device,
+        )
+        logger.info(f"using {ffn_layer} layer as FFN")
+        ffn_layer_cls = ffn_layer_dict[ffn_layer]
+        ffn_ratio_sequence = [ffn_ratio] * depth
+        blocks_list = [
+            SelfAttentionBlock(
+                dim=embed_dim,
+                num_heads=num_heads,
+                ffn_ratio=ffn_ratio_sequence[i],
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=drop_path_rate,
+                norm_layer=norm_layer_cls,
+                act_layer=nn.GELU,
+                ffn_layer=ffn_layer_cls,
+                init_values=layerscale_init,
+                mask_k_bias=mask_k_bias,
+                device=device,
+            )
+            for i in range(depth)
+        ]
+        self.chunked_blocks = False
+        self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer_cls(embed_dim)
+        self.untie_cls_and_patch_norms = untie_cls_and_patch_norms
+        if untie_cls_and_patch_norms:
+            self.cls_norm = norm_layer_cls(embed_dim)
+        else:
+            self.cls_norm = None
+        self.untie_global_and_local_cls_norm = untie_global_and_local_cls_norm
+        if untie_global_and_local_cls_norm:
+            self.local_cls_norm = norm_layer_cls(embed_dim)
+        else:
+            self.local_cls_norm = None
+        self.head = nn.Identity()
+        self.mask_token = nn.Parameter(torch.empty(1, embed_dim, device=device))
+    def init_weights(self):
+        self.rope_embed._init_weights()
+        nn.init.normal_(self.cls_token, std=0.02)
+        if self.n_storage_tokens > 0:
+            nn.init.normal_(self.storage_tokens, std=0.02)
+        nn.init.zeros_(self.mask_token)
+        named_apply(init_weights_vit, self)
+    def prepare_tokens_with_masks(self, x: Tensor, masks=None) -> Tuple[Tensor, Tuple[int]]:
+        x = self.patch_embed(x)
+        B, H, W, _ = x.shape
+        x = x.flatten(1, 2)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+            cls_token = self.cls_token
+        else:
+            cls_token = self.cls_token + 0 * self.mask_token
+        if self.n_storage_tokens > 0:
+            storage_tokens = self.storage_tokens
+        else:
+            storage_tokens = torch.empty(
+                1,
+                0,
+                cls_token.shape[-1],
+                dtype=cls_token.dtype,
+                device=cls_token.device,
+            )
+        x = torch.cat(
+            [
+                cls_token.expand(B, -1, -1),
+                storage_tokens.expand(B, -1, -1),
+                x,
+            ],
+            dim=1,
+        )
+        return x, (H, W)
+    def forward_features_list(self, x_list: List[Tensor], masks_list: List[Tensor]) -> List[Dict[str, Tensor]]:
+        x = []
+        rope = []
+        for t_x, t_masks in zip(x_list, masks_list):
+            t2_x, hw_tuple = self.prepare_tokens_with_masks(t_x, t_masks)
+            x.append(t2_x)
+            rope.append(hw_tuple)
+        for _, blk in enumerate(self.blocks):
+            if self.rope_embed is not None:
+                rope_sincos = [self.rope_embed(H=H, W=W) for H, W in rope]
+            else:
+                rope_sincos = [None for _ in rope]
+            x = blk(x, rope_sincos)
+        all_x = x
+        output = []
+        for idx, (x, masks) in enumerate(zip(all_x, masks_list)):
+            if self.untie_cls_and_patch_norms or self.untie_global_and_local_cls_norm:
+                if self.untie_global_and_local_cls_norm and self.training and idx == 1:
+                    x_norm_cls_reg = self.local_cls_norm(x[:, : self.n_storage_tokens + 1])
+                elif self.untie_cls_and_patch_norms:
+                    x_norm_cls_reg = self.cls_norm(x[:, : self.n_storage_tokens + 1])
+                else:
+                    x_norm_cls_reg = self.norm(x[:, : self.n_storage_tokens + 1])
+                x_norm_patch = self.norm(x[:, self.n_storage_tokens + 1 :])
+            else:
+                x_norm = self.norm(x)
+                x_norm_cls_reg = x_norm[:, : self.n_storage_tokens + 1]
+                x_norm_patch = x_norm[:, self.n_storage_tokens + 1 :]
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm_cls_reg[:, 0],
+                    "x_storage_tokens": x_norm_cls_reg[:, 1:],
+                    "x_norm_patchtokens": x_norm_patch,
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+    def forward_features(self, x: Tensor | List[Tensor], masks: Optional[Tensor] = None) -> List[Dict[str, Tensor]]:
+        if isinstance(x, torch.Tensor):
+            return self.forward_features_list([x], [masks])[0]
+        else:
+            return self.forward_features_list(x, masks)
+    def _get_intermediate_layers_not_chunked(self, x: Tensor, n: int = 1) -> List[Tensor]:
+        x, (H, W) = self.prepare_tokens_with_masks(x)
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for i, blk in enumerate(self.blocks):
+            if self.rope_embed is not None:
+                rope_sincos = self.rope_embed(H=H, W=W)
+            else:
+                rope_sincos = None
+            x = blk(x, rope_sincos)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        *,
+        n: Union[int, Sequence] = 1,
+        reshape: bool = False,
+        return_class_token: bool = False,
+        return_extra_tokens: bool = False,
+        norm: bool = True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, ...]]]:
+        outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs_normed = []
+            for out in outputs:
+                if self.untie_cls_and_patch_norms:
+                    x_norm_cls_reg = self.cls_norm(out[:, : self.n_storage_tokens + 1])
+                    x_norm_patch = self.norm(out[:, self.n_storage_tokens + 1 :])
+                    outputs_normed.append(torch.cat((x_norm_cls_reg, x_norm_patch), dim=1))
+                else:
+                    outputs_normed.append(self.norm(out))
+            outputs = outputs_normed
+        class_tokens = [out[:, 0] for out in outputs]
+        extra_tokens = [out[:, 1 : self.n_storage_tokens + 1] for out in outputs]
+        outputs = [out[:, self.n_storage_tokens + 1 :] for out in outputs]
+        if reshape:
+            B, _, h, w = x.shape
+            outputs = [
+                out.reshape(B, h // self.patch_size, w // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if not return_class_token and not return_extra_tokens:
+            return tuple(outputs)
+        elif return_class_token and not return_extra_tokens:
+            return tuple(zip(outputs, class_tokens))
+        elif not return_class_token and return_extra_tokens:
+            return tuple(zip(outputs, extra_tokens))
+        elif return_class_token and return_extra_tokens:
+            return tuple(zip(outputs, class_tokens, extra_tokens))
+    def forward(self, *args, is_training: bool = False, **kwargs) -> List[Dict[str, Tensor]] | Tensor:
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])

eupe/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .utils import cat_keep_shapes, named_apply, uncat_with_shapes
2	+

eupe/utils/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (279 Bytes). View file

eupe/utils/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (3.08 kB). View file

eupe/utils/utils.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import random
+from typing import Callable, List, Tuple
+import numpy as np
+import torch
+from torch import Tensor, nn
+def cat_keep_shapes(x_list: List[Tensor]) -> Tuple[Tensor, List[Tuple[int]], List[int]]:
+    shapes = [x.shape for x in x_list]
+    num_tokens = [x.select(dim=-1, index=0).numel() for x in x_list]
+    flattened = torch.cat([x.flatten(0, -2) for x in x_list])
+    return flattened, shapes, num_tokens
+def uncat_with_shapes(flattened: Tensor, shapes: List[Tuple[int]], num_tokens: List[int]) -> List[Tensor]:
+    outputs_splitted = torch.split_with_sizes(flattened, num_tokens, dim=0)
+    shapes_adjusted = [shape[:-1] + torch.Size([flattened.shape[-1]]) for shape in shapes]
+    outputs_reshaped = [o.reshape(shape) for o, shape in zip(outputs_splitted, shapes_adjusted)]
+    return outputs_reshaped
+def named_apply(
+    fn: Callable,
+    module: nn.Module,
+    name: str = "",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+def fix_random_seeds(seed: int = 31):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)

transformers_eupe.py CHANGED Viewed

@@ -112,6 +112,7 @@ class EupeViTModel(PreTrainedModel):
             mask_k_bias=config.mask_k_bias,
         )
         self.vit.init_weights()
     def _init_weights(self, module: nn.Module) -> None:
         # Signature required by PreTrainedModel; initialization is delegated to DinoVisionTransformer.

             mask_k_bias=config.mask_k_bias,
         )
         self.vit.init_weights()
+        self.post_init()
     def _init_weights(self, module: nn.Module) -> None:
         # Signature required by PreTrainedModel; initialization is delegated to DinoVisionTransformer.