CLIP

CLIPImageEncoder

CLIPImageEncoder(
    image_size: int = 224,
    embedding_dim: int = 768,
    output_dim: int = 512,
    patch_size: int = 32,
    num_layers: int = 12,
    num_attention_heads: int = 12,
    feedforward_dim: int = 3072,
    layer_norm_eps: float = 1e-05,
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: Chain

Contrastive Language-Image Pretraining (CLIP) image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `image_size` | `int` | The size of the input image. | `224` |
| `embedding_dim` | `int` | The dimension of the embedding. | `768` |
| `output_dim` | `int` | The dimension of the output. | `512` |
| `patch_size` | `int` | The size of the patches. | `32` |
| `num_layers` | `int` | The number of layers. | `12` |
| `num_attention_heads` | `int` | The number of attention heads. | `12` |
| `feedforward_dim` | `int` | The dimension of the feedforward layer. | `3072` |
| `layer_norm_eps` | `float` | The epsilon value for normalization. | `1e-05` |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/image_encoder.py
def __init__(
    self,
    image_size: int = 224,
    embedding_dim: int = 768,
    output_dim: int = 512,
    patch_size: int = 32,
    num_layers: int = 12,
    num_attention_heads: int = 12,
    feedforward_dim: int = 3072,
    layer_norm_eps: float = 1e-5,
    device: Device | str | None = None,
    dtype: DType | None = None,
) -> None:
    """Initialize a CLIP image encoder.

    Args:
        image_size: The size of the input image.
        embedding_dim: The dimension of the embedding.
        output_dim: The dimension of the output.
        patch_size: The size of the patches.
        num_layers: The number of layers.
        num_attention_heads: The number of attention heads.
        feedforward_dim: The dimension of the feedforward layer.
        layer_norm_eps: The epsilon value for normalization.
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    self.image_size = image_size
    self.embedding_dim = embedding_dim
    self.output_dim = output_dim
    self.patch_size = patch_size
    self.num_layers = num_layers
    self.num_attention_heads = num_attention_heads
    self.feedforward_dim = feedforward_dim
    cls_token_pooling: Callable[[Tensor], Tensor] = lambda x: x[:, 0, :]
    super().__init__(
        ViTEmbeddings(
            image_size=image_size, embedding_dim=embedding_dim, patch_size=patch_size, device=device, dtype=dtype
        ),
        fl.LayerNorm(normalized_shape=embedding_dim, eps=layer_norm_eps, device=device, dtype=dtype),
        fl.Chain(
            TransformerLayer(
                embedding_dim=embedding_dim,
                feedforward_dim=feedforward_dim,
                num_attention_heads=num_attention_heads,
                layer_norm_eps=layer_norm_eps,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ),
        fl.Lambda(func=cls_token_pooling),
        fl.LayerNorm(normalized_shape=embedding_dim, eps=layer_norm_eps, device=device, dtype=dtype),
        fl.Linear(in_features=embedding_dim, out_features=output_dim, bias=False, device=device, dtype=dtype),
    )
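
A minimal usage sketch, not taken from the reference above: it assumes the class is importable from `refiners.foundationals.clip.image_encoder` (the source path shown) and that, like any `fl.Chain`, the encoder can be called directly on a batched image tensor of shape `(batch, 3, image_size, image_size)`.

```python
import torch

from refiners.foundationals.clip.image_encoder import CLIPImageEncoder

# Build the default encoder described above (ViT-B/32-like:
# 224x224 input, 32x32 patches, 12 layers, output_dim=512).
encoder = CLIPImageEncoder()

# A dummy batch of one RGB image, already resized to 224x224.
image = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    embedding = encoder(image)

# The chain ends with CLS-token pooling, a LayerNorm and a Linear
# projection, so the expected output shape is (batch, output_dim).
print(embedding.shape)  # torch.Size([1, 512])
```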

CLIPImageEncoderG

CLIPImageEncoderG(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: CLIPImageEncoder

CLIP giant image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `embedding_dim` | `int` | `1664` |
| `output_dim` | `int` | `1280` |
| `patch_size` | `int` | `14` |
| `num_layers` | `int` | `48` |
| `num_attention_heads` | `int` | `16` |
| `feedforward_dim` | `int` | `8192` |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/image_encoder.py
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
    """Initialize CLIP giant image encoder.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1664,
        output_dim=1280,
        patch_size=14,
        num_layers=48,
        num_attention_heads=16,
        feedforward_dim=8192,
        device=device,
        dtype=dtype,
    )

CLIPImageEncoderH

CLIPImageEncoderH(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: CLIPImageEncoder

CLIP huge image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `embedding_dim` | `int` | `1280` |
| `output_dim` | `int` | `1024` |
| `patch_size` | `int` | `14` |
| `num_layers` | `int` | `32` |
| `num_attention_heads` | `int` | `16` |
| `feedforward_dim` | `int` | `5120` |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/image_encoder.py
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
    """Initialize CLIP huge image encoder.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1280,
        output_dim=1024,
        patch_size=14,
        num_layers=32,
        num_attention_heads=16,
        feedforward_dim=5120,
        device=device,
        dtype=dtype,
    )
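
As an illustration (not part of the reference), the preset classes only differ in the constants they pass to `CLIPImageEncoder.__init__`, so their hyperparameters can be read back from the attributes set by the base class; note that instantiating them builds the full (large) model on the chosen device.

```python
from refiners.foundationals.clip.image_encoder import (
    CLIPImageEncoderG,
    CLIPImageEncoderH,
)

# Both presets use 14x14 patches but differ in width and depth.
h = CLIPImageEncoderH()
g = CLIPImageEncoderG()

print(h.embedding_dim, h.num_layers, h.output_dim)  # 1280 32 1024
print(g.embedding_dim, g.num_layers, g.output_dim)  # 1664 48 1280
```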

CLIPTextEncoder

CLIPTextEncoder(
    embedding_dim: int = 768,
    max_sequence_length: int = 77,
    vocabulary_size: int = 49408,
    num_layers: int = 12,
    num_attention_heads: int = 12,
    feedforward_dim: int = 3072,
    layer_norm_eps: float = 1e-05,
    use_quick_gelu: bool = False,
    tokenizer: CLIPTokenizer | None = None,
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: Chain

Contrastive Language-Image Pretraining (CLIP) text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embedding_dim` | `int` | The embedding dimension. | `768` |
| `max_sequence_length` | `int` | The maximum sequence length. | `77` |
| `vocabulary_size` | `int` | The vocabulary size. | `49408` |
| `num_layers` | `int` | The number of layers. | `12` |
| `num_attention_heads` | `int` | The number of attention heads. | `12` |
| `feedforward_dim` | `int` | The feedforward dimension. | `3072` |
| `layer_norm_eps` | `float` | The epsilon value for layer normalization. | `1e-05` |
| `use_quick_gelu` | `bool` | Whether to use the quick GeLU activation function. | `False` |
| `tokenizer` | `CLIPTokenizer \| None` | The tokenizer. | `None` |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/text_encoder.py
def __init__(
    self,
    embedding_dim: int = 768,
    max_sequence_length: int = 77,
    vocabulary_size: int = 49408,
    num_layers: int = 12,
    num_attention_heads: int = 12,
    feedforward_dim: int = 3072,
    layer_norm_eps: float = 1e-5,
    use_quick_gelu: bool = False,
    tokenizer: CLIPTokenizer | None = None,
    device: Device | str | None = None,
    dtype: DType | None = None,
) -> None:
    """Initialize CLIP text encoder.

    Args:
        embedding_dim: The embedding dimension.
        max_sequence_length: The maximum sequence length.
        vocabulary_size: The vocabulary size.
        num_layers: The number of layers.
        num_attention_heads: The number of attention heads.
        feedforward_dim: The feedforward dimension.
        layer_norm_eps: The epsilon value for layer normalization.
        use_quick_gelu: Whether to use the quick GeLU activation function.
        tokenizer: The tokenizer.
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    self.embedding_dim = embedding_dim
    self.max_sequence_length = max_sequence_length
    self.vocabulary_size = vocabulary_size
    self.num_layers = num_layers
    self.num_attention_heads = num_attention_heads
    self.feedforward_dim = feedforward_dim
    self.layer_norm_eps = layer_norm_eps
    self.use_quick_gelu = use_quick_gelu
    super().__init__(
        tokenizer or CLIPTokenizer(sequence_length=max_sequence_length),
        fl.Converter(set_dtype=False),
        fl.Sum(
            TokenEncoder(
                vocabulary_size=vocabulary_size,
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            ),
            PositionalEncoder(
                max_sequence_length=max_sequence_length,
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            ),
        ),
        *(
            TransformerLayer(
                embedding_dim=embedding_dim,
                num_attention_heads=num_attention_heads,
                feedforward_dim=feedforward_dim,
                layer_norm_eps=layer_norm_eps,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ),
        fl.LayerNorm(normalized_shape=embedding_dim, eps=layer_norm_eps, device=device, dtype=dtype),
    )
    if use_quick_gelu:
        for gelu, parent in self.walk(predicate=lambda m, _: isinstance(m, fl.GeLU)):
            parent.replace(
                old_module=gelu,
                new_module=fl.GeLU(approximation=fl.GeLUApproximation.SIGMOID),
            )
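
A minimal usage sketch (an assumption based on the chain above, not an excerpt from the docs): because the chain starts with a `CLIPTokenizer`, the encoder can be called directly on a prompt string and returns per-token embeddings of shape `(batch, max_sequence_length, embedding_dim)`. Unlike the image encoder, there is no final projection to a separate output dimension.

```python
import torch

from refiners.foundationals.clip.text_encoder import CLIPTextEncoder

# Defaults: 12 layers, embedding_dim=768, max_sequence_length=77.
encoder = CLIPTextEncoder()

with torch.no_grad():
    # The leading CLIPTokenizer turns the string into token ids,
    # so a raw prompt can be passed directly to the chain.
    embeddings = encoder("a photo of a cat")

print(embeddings.shape)  # torch.Size([1, 77, 768])
```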

CLIPTextEncoderG

CLIPTextEncoderG(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: CLIPTextEncoder

CLIP giant text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `embedding_dim` | `int` | `1280` |
| `num_layers` | `int` | `32` |
| `num_attention_heads` | `int` | `20` |
| `feedforward_dim` | `int` | `5120` |
| `tokenizer` | `CLIPTokenizer` | `CLIPTokenizer(pad_token_id=0)` |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/text_encoder.py
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
    """Initialize CLIP giant text encoder.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    tokenizer = CLIPTokenizer(pad_token_id=0)
    super().__init__(
        embedding_dim=1280,
        num_layers=32,
        num_attention_heads=20,
        feedforward_dim=5120,
        tokenizer=tokenizer,
        device=device,
        dtype=dtype,
    )
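
The distinguishing detail of this preset is the custom tokenizer: it forwards `CLIPTokenizer(pad_token_id=0)` so padding uses token id 0 instead of the tokenizer's default. The same mechanism is available on the base class; a hedged sketch, assuming `CLIPTokenizer` is importable from `refiners.foundationals.clip.tokenizer`:

```python
from refiners.foundationals.clip.text_encoder import CLIPTextEncoder
from refiners.foundationals.clip.tokenizer import CLIPTokenizer  # assumed module path

# Mirror what CLIPTextEncoderG does: override the pad token id
# while passing the same hyperparameters explicitly.
encoder = CLIPTextEncoder(
    embedding_dim=1280,
    num_layers=32,
    num_attention_heads=20,
    feedforward_dim=5120,
    tokenizer=CLIPTokenizer(pad_token_id=0),
)
```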

CLIPTextEncoderH

CLIPTextEncoderH(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: CLIPTextEncoder

CLIP huge text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `embedding_dim` | `int` | `1024` |
| `num_layers` | `int` | `23` |
| `num_attention_heads` | `int` | `16` |
| `feedforward_dim` | `int` | `4096` |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/text_encoder.py
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
    """Initialize CLIP huge text encoder.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1024,
        num_layers=23,
        num_attention_heads=16,
        feedforward_dim=4096,
        device=device,
        dtype=dtype,
    )

CLIPTextEncoderL

CLIPTextEncoderL(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: CLIPTextEncoder

CLIP large text encoder.

Note

We replace the GeLU activation function with an approximate GeLU to match OpenAI's original CLIP implementation (https://github.com/openai/CLIP/blob/a1d0717/clip/model.py#L166).

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) for more details.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `embedding_dim` | `int` | `768` |
| `num_layers` | `int` | `12` |
| `num_attention_heads` | `int` | `12` |
| `feedforward_dim` | `int` | `3072` |
| `use_quick_gelu` | `bool` | `True` |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `device` | `device \| str \| None` | The PyTorch device to use. | `None` |
| `dtype` | `dtype \| None` | The PyTorch data type to use. | `None` |
Source code in src/refiners/foundationals/clip/text_encoder.py
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
    """Initialize CLIP large text encoder.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=768,
        num_layers=12,
        num_attention_heads=12,
        feedforward_dim=3072,
        use_quick_gelu=True,
        device=device,
        dtype=dtype,
    )
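
For reference, the "quick GeLU" enabled by `use_quick_gelu=True` is the sigmoid-based approximation used in OpenAI's CLIP, `x * sigmoid(1.702 * x)`. A standalone sketch in plain PyTorch (not refiners internals) comparing it to the exact GeLU:

```python
import torch
import torch.nn.functional as F


def quick_gelu(x: torch.Tensor) -> torch.Tensor:
    # Sigmoid approximation of GeLU used by OpenAI's CLIP (QuickGELU).
    return x * torch.sigmoid(1.702 * x)


x = torch.linspace(-3, 3, steps=7)
print(F.gelu(x))      # exact GeLU
print(quick_gelu(x))  # close, but not identical, hence the dedicated flag
```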