DINOv2

DINOv2_base

DINOv2_base(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 base model.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 768 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 12 |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 base model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=768,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=12,
        device=device,
        dtype=dtype,
    )
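
Example: a minimal forward pass, as a sketch. It assumes the public import path `refiners.foundationals.dinov2` and builds the model with random weights (pretrained weights must be converted and loaded separately).

```python
import torch

from refiners.foundationals.dinov2 import DINOv2_base

model = DINOv2_base()  # random weights; load converted weights for real use

# A stand-in for a preprocessed batch (see `preprocess` at the bottom of this page).
x = torch.randn(1, 3, 518, 518)

with torch.no_grad():
    tokens = model(x)

# Expected: 1 class token + (518 // 14) ** 2 = 1369 patch tokens, each of dim 768.
print(tokens.shape)  # torch.Size([1, 1370, 768])
```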

DINOv2_base_reg

DINOv2_base_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 base model with registers.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) and [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 768 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 12 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 base model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=768,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=12,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )
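
The register variants add `num_registers` learned tokens ahead of the transformer (the `Registers` block is inserted before it), so the output sequence should be four tokens longer than the plain model's. A quick shape check, under the same import-path assumption as above:

```python
import torch

from refiners.foundationals.dinov2 import DINOv2_base, DINOv2_base_reg

x = torch.randn(1, 3, 518, 518)  # stand-in for a preprocessed batch

with torch.no_grad():
    print(DINOv2_base()(x).shape)      # expected: (1, 1370, 768) -- CLS + 1369 patches
    print(DINOv2_base_reg()(x).shape)  # expected: (1, 1374, 768) -- 4 extra register tokens
```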

DINOv2_giant

DINOv2_giant(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 giant model.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 1536 |
| feedforward_dim | int | 4096 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 40 |
| num_heads | int | 24 |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 giant model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1536,
        feedforward_dim=4096,
        patch_size=14,
        image_size=518,
        num_layers=40,
        num_heads=24,
        activation=GLU(SiLU()),
        device=device,
        dtype=dtype,
    )
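
The giant variant is by far the largest of the family (on the order of a billion parameters), so it is commonly run in half precision on a GPU. Both `device` and `dtype` are plain constructor arguments; a sketch, assuming a CUDA device is available:

```python
import torch

from refiners.foundationals.dinov2 import DINOv2_giant

model = DINOv2_giant(device="cuda", dtype=torch.float16)

x = torch.randn(1, 3, 518, 518, device="cuda", dtype=torch.float16)
with torch.no_grad():
    tokens = model(x)  # expected shape: (1, 1370, 1536)
```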

DINOv2_giant_reg

DINOv2_giant_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 giant model with registers.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) and [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 1536 |
| feedforward_dim | int | 4096 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 40 |
| num_heads | int | 24 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 giant model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1536,
        feedforward_dim=4096,
        patch_size=14,
        image_size=518,
        num_layers=40,
        num_heads=24,
        num_registers=4,
        interpolate_antialias=True,
        activation=GLU(SiLU()),
        device=device,
        dtype=dtype,
    )

DINOv2_large

DINOv2_large(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 large model.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 1024 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 24 |
| num_heads | int | 16 |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 large model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1024,
        patch_size=14,
        image_size=518,
        num_layers=24,
        num_heads=16,
        device=device,
        dtype=dtype,
    )

DINOv2_large_reg

DINOv2_large_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 large model with registers.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) and [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 1024 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 24 |
| num_heads | int | 16 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 large model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1024,
        patch_size=14,
        image_size=518,
        num_layers=24,
        num_heads=16,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )
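
None of these constructors download pretrained weights. A sketch of the usual refiners loading pattern, assuming you have already converted the official checkpoint to a `.safetensors` file; the file name below is hypothetical, and `load_from_safetensors` is assumed to be available on refiners modules as elsewhere in the library:

```python
from refiners.foundationals.dinov2 import DINOv2_large_reg

model = DINOv2_large_reg(device="cuda")
# Hypothetical path to weights previously converted from the official
# DINOv2 ViT-L/14 + registers checkpoint.
model.load_from_safetensors("dinov2_vitl14_reg4.safetensors")
model.eval()
```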

DINOv2_small

DINOv2_small(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 small model.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 384 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 6 |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 small model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=384,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=6,
        device=device,
        dtype=dtype,
    )
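
Because the positional embedding is interpolated at run time (`InterpolateEmbedding`), inputs do not have to be 518x518. A sketch running the small model at 224x224 and taking the class token as a global image descriptor (import path and file name are assumptions):

```python
import torch
from PIL import Image

from refiners.foundationals.dinov2 import DINOv2_small, preprocess

model = DINOv2_small()

x = preprocess(Image.open("photo.jpg"), dim=224).unsqueeze(0)  # (1, 3, 224, 224)
with torch.no_grad():
    tokens = model(x)  # expected: (1, 257, 384) -- (224 // 14) ** 2 = 256 patches + CLS

cls_embedding = tokens[:, 0]  # a common choice of global descriptor, shape (1, 384)
```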

DINOv2_small_reg

DINOv2_small_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 small model with registers.

See [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) and [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) for more details.

Attributes:

| Name | Type | Value |
| --- | --- | --- |
| embedding_dim | int | 384 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 6 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 small model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=384,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=6,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )

ViT

ViT(
    embedding_dim: int = 768,
    patch_size: int = 16,
    image_size: int = 224,
    num_layers: int = 12,
    num_heads: int = 12,
    norm_eps: float = 1e-06,
    mlp_ratio: int = 4,
    num_registers: int = 0,
    activation: Activation = fl.GeLU(),
    feedforward_dim: int | None = None,
    interpolate_antialias: bool = False,
    interpolate_mode: str = "bicubic",
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: Chain

Vision Transformer (ViT) model.

See [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) for more details.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| embedding_dim | int | The dimension of the embedding. | 768 |
| patch_size | int | The size of the patches. | 16 |
| image_size | int | The size of the input image. | 224 |
| num_layers | int | The number of transformer layers. | 12 |
| num_heads | int | The number of attention heads. | 12 |
| norm_eps | float | The epsilon value for layer normalization. | 1e-06 |
| mlp_ratio | int | The ratio for the multi-layer perceptron (MLP). | 4 |
| num_registers | int | The number of register tokens. | 0 |
| activation | Activation | The activation function. | GeLU() |
| feedforward_dim | int \| None | The dimension of the feedforward layer. | None |
| interpolate_antialias | bool | Whether to use antialiasing when interpolating the positional embedding. | False |
| interpolate_mode | str | The interpolation mode. | 'bicubic' |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |

Source code in src/refiners/foundationals/dinov2/vit.py
def __init__(
    self,
    embedding_dim: int = 768,
    patch_size: int = 16,
    image_size: int = 224,
    num_layers: int = 12,
    num_heads: int = 12,
    norm_eps: float = 1e-6,
    mlp_ratio: int = 4,
    num_registers: int = 0,
    activation: Activation = fl.GeLU(),
    feedforward_dim: int | None = None,
    interpolate_antialias: bool = False,
    interpolate_mode: str = "bicubic",
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize a Vision Transformer (ViT) model.

    Args:
        embedding_dim: The dimension of the embedding.
        patch_size: The size of the patches.
        image_size: The size of the input image.
        num_layers: The number of layers.
        num_heads: The number of heads.
        norm_eps: The epsilon value for normalization.
        mlp_ratio: The ratio for the multi-layer perceptron (MLP).
        num_registers: The number of registers.
        activation: The activation function.
        feedforward_dim: The dimension of the feedforward layer.
        interpolate_antialias: Whether to use antialiasing for interpolation.
        interpolate_mode: The interpolation mode.
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    num_patches = image_size // patch_size
    self.embedding_dim = embedding_dim
    self.patch_size = patch_size
    self.image_size = image_size
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.norm_eps = norm_eps
    self.mlp_ratio = mlp_ratio
    self.num_registers = num_registers
    self.feedforward_dim = feedforward_dim

    super().__init__(
        fl.Concatenate(
            ClassToken(
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            ),
            PatchEncoder(
                in_channels=3,
                out_channels=embedding_dim,
                patch_size=patch_size,
                device=device,
                dtype=dtype,
            ),
            dim=1,
        ),
        PositionalEncoder(
            PositionalEmbedding(
                sequence_length=num_patches**2 + 1,
                embedding_dim=embedding_dim,
                patch_size=patch_size,
                device=device,
                dtype=dtype,
            ),
            fl.Chain(
                fl.Parallel(
                    fl.Identity(),
                    fl.UseContext(context="dinov2_vit", key="input"),
                ),
                InterpolateEmbedding(
                    mode=interpolate_mode,
                    antialias=interpolate_antialias,
                    patch_size=patch_size,
                ),
            ),
        ),
        Transformer(
            TransformerLayer(
                embedding_dim=embedding_dim,
                feedforward_dim=feedforward_dim,
                activation=activation,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                norm_eps=norm_eps,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ),
        fl.LayerNorm(
            normalized_shape=embedding_dim,
            eps=norm_eps,
            device=device,
            dtype=dtype,
        ),
    )

    if self.num_registers > 0:
        registers = Registers(
            num_registers=num_registers,
            embedding_dim=embedding_dim,
            device=device,
            dtype=dtype,
        )
        self.insert_before_type(Transformer, registers)
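
Since the presets above are thin wrappers around this constructor, a custom configuration is a direct call. A sketch of a hypothetical ViT-S/16-style encoder at 224 px, using only the parameters documented above:

```python
import torch

from refiners.foundationals.dinov2.vit import ViT

vit = ViT(
    embedding_dim=384,
    patch_size=16,
    image_size=224,
    num_layers=12,
    num_heads=6,
)

x = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    out = vit(x)  # expected: (2, 197, 384) -- (224 // 16) ** 2 = 196 patches + CLS
```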

preprocess

preprocess(img: Image, dim: int = 224) -> Tensor

Preprocess an image for use with DINOv2. Uses the ImageNet mean and standard deviation. Note that this only resizes and normalizes the image; there is no center crop.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| img | Image | The image. | required |
| dim | int | The square dimension to resize the image. Typically 224 or 518. | 224 |

Returns:

| Type | Description |
| --- | --- |
| Tensor | A float32 tensor with shape (3, dim, dim). |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def preprocess(img: Image.Image, dim: int = 224) -> torch.Tensor:
    """
    Preprocess an image for use with DINOv2. Uses ImageNet mean and standard deviation.
    Note that this only resizes and normalizes the image, there is no center crop.

    Args:
        img: The image.
        dim: The square dimension to resize the image. Typically 224 or 518.

    Returns:
        A float32 tensor with shape (3, dim, dim).
    """
    img = img.convert("RGB").resize((dim, dim))  # type: ignore
    t = image_to_tensor(img).squeeze()
    return normalize(t, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
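
`preprocess` returns a single unbatched `(3, dim, dim)` tensor, so build batches by stacking. A sketch, under the same import-path assumption as the examples above (file names hypothetical):

```python
import torch
from PIL import Image

from refiners.foundationals.dinov2 import DINOv2_base, preprocess

model = DINOv2_base()

paths = ["a.jpg", "b.jpg"]
batch = torch.stack([preprocess(Image.open(p), dim=518) for p in paths])  # (2, 3, 518, 518)

with torch.no_grad():
    tokens = model(batch)  # expected: (2, 1370, 768)
```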