Files changed (2)
  1. chatNT.py +36 -4
  2. text_generation.py +8 -13
chatNT.py CHANGED
@@ -28,6 +28,7 @@ class RotaryEmbeddingConfig:
 class PerceiverResamplerConfig:
     """
     Parameters to initialize an PerceiverResampler model.
+
     Args:
         emb_layer_norm_before: Whether to use layer norm before the first attention
             layer.
@@ -92,7 +93,9 @@ class PerceiverResamplerConfig:
 class GptConfig:
     """
     Parameters to initialize a Gpt model.
+
     NOTE: the pad token is not defined
+
     Args:
         vocab_size: Token vocabulary.
         eos_token_id: used to stop sentence generation
@@ -188,6 +191,7 @@ class GptConfig:
 class NucleotideTransformerConfig:
     """
     Parameters to initialize an NT model.
+
     Args:
         alphabet_size: Token vocabulary.
         pad_token_id: ID of pad token.
@@ -369,6 +373,7 @@ class TorchBioBrainDecoder(nn.Module):
         """
         Initializes the BioBrain decoder, using a GPT model for text generation with
         bio embeddings.
+
         Args:
             gpt_config: Configuration for the GPT model
             seq_token_id: Index of the SEQ token
@@ -385,11 +390,13 @@ class TorchBioBrainDecoder(nn.Module):
     ) -> torch.Tensor:
         """
         Forward pass through the model.
+
         Args:
             english_token_ids: Tensor of English token IDs with shape
                 (batch_size, num_english_tokens).
             projected_bio_embeddings: Optional tensor of bio embeddings with shape
                 (batch_size, num_bio_sequences, ?, embed_dim).
+
         Returns:
             torch.Tensor: The logits from the GPT model,
                 shaped (batch_size, num_english_tokens, vocab_size).
@@ -445,11 +452,13 @@ class TorchBioBrainDecoder(nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Inserts resampled embeddings in input_embeddings, starting at the SEQ token
+
         Args:
             tokens (torch.Tensor): Shape (batch_size, num_tokens)
             input_embeddings (torch.Tensor): Shape (batch_size, num_tokens, embed_dim)
             resampled_embeddings (torch.Tensor):
                 Shape (batch_size, num_bio_sequences, bio_sequence_length, embed_dim)
+
         Returns:
             Tuple[torch.Tensor, torch.Tensor]:
                 - input_embeddings with resampled_embeddings inserted at the SEQ token
@@ -512,9 +521,11 @@ class TorchBioBrainDecoder(nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Removes the logits corresponding to the unused embeddings.
+
         Args:
             tokens: Input english tokens.
             logits: Input logits.
+
         Returns:
             Cleaned logits, last values will be equal to 0.
         """
@@ -629,34 +640,39 @@ class TorchMultiOmicsModel(PreTrainedModel):
 
     def forward(
         self,
-        multi_omics_tokens_ids: tuple[torch.Tensor, torch.Tensor | None],
+        multi_omics_tokens_ids: tuple[torch.Tensor, torch.Tensor],
         projection_english_tokens_ids: torch.Tensor,
         projected_bio_embeddings: torch.Tensor = None,
    ) -> dict[str, torch.Tensor]:
         """
+
         Args:
             multi_omics_tokens_ids (Tuple[torch.Tensor, torch.Tensor]):
                 english_tokens_ids: Represents the prompt tokens (english tokens)
                     Shape (batch_size, num_english_tokens)
+
                 bio_tokens_ids: Represents the bio sequences tokens
                     Shape (batch_size, num_bio_sequences, num_bio_tokens)
+
             projection_english_tokens_ids (torch.Tensor):
                 Shape (batch_size, num_english_tokens)
+
             projected_bio_embeddings (projected_bio_embeddings, optional):
                 Shape (batch_size, num_bio_sequencse, ?, embed_dim).
                 Defaults to None.
+
         Returns:
             dict[str, torch.Tensor] containing:
                 - logits:
                     Shape (batch_size, num_tokens, vocab_size)
+
                 - projected_bio_embeddings:
                     Shape (batch_size, num_bio_sequences, ?, embed_dim)
         """
         english_token_ids, bio_token_ids = multi_omics_tokens_ids
         english_token_ids = english_token_ids.clone()
+        bio_token_ids = bio_token_ids.clone()
         projection_english_tokens_ids = projection_english_tokens_ids.clone()
-        if bio_token_ids is not None:
-            bio_token_ids = bio_token_ids.clone()
         if projected_bio_embeddings is not None:
             projected_bio_embeddings = projected_bio_embeddings.clone()
 
@@ -724,6 +740,7 @@ class TorchRotaryEmbedding(torch.nn.Module):
     def _create_sinusoidal_positions(self, device: torch.device) -> torch.Tensor:
         """
         Create the sines and cosines for the RoPE.
+
         Returns:
             Sinusoidal positions of shape (self.max_seq_len, self.dim).
         """
@@ -756,9 +773,11 @@ class TorchRotaryEmbedding(torch.nn.Module):
     def _rotate_every_two(self, x: torch.Tensor) -> torch.Tensor:
         """
         Prepare a tensor to apply the RoPE mechanism.
+
         Args:
             x: Tensor of shape (batch_size, seq_len, num_heads, head_dim),
                 typically this is the key or query tensor.
+
         Returns:
             The even indices in the last dimension have their sign flipped.
             Tensor of shape (batch_size, seq_len, num_heads, head_dim).
@@ -775,10 +794,12 @@ class TorchRotaryEmbedding(torch.nn.Module):
     ) -> torch.Tensor:
         """
         Applies rotary embeddings to x.
+
         Args:
             x: Tensor of shape (batch_size, seq_len, num_heads, head_dim),
                 typically this is the key or query tensor.
             sincos: Tuple of sine and cosine tensors for position encoding.
+
         Returns:
             RoPE embeddings tensor.
         """
@@ -796,10 +817,12 @@ class TorchRotaryEmbedding(torch.nn.Module):
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Applies rotary embeddings to k and q.
+
         Args:
             k: key tensor of shape (batch_size, seq_len, num_heads, head_dim),
             q: value tensor of shape (batch_size, seq_len, num_heads, head_dim),
             positions: optional positions offset useful when caching,
+
         Returns:
             RoPE embeddings for the keys and values.
         """
@@ -1117,9 +1140,11 @@ def build_causal_attention_mask(
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
+
     Args:
         batch_size: Batch size.
         seq_len: Length of the sequences.
+
     Returns:
         Batch of causal masks.
     """
@@ -1525,11 +1550,13 @@ class TorchNucleotideTransformer(nn.Module):
     ) -> torch.Tensor:
         """
         Computes the embeddings based on the input tokens.
+
         Args:
             tokens: Input tokens out of the tokenizer of shape (batch_size, seq_len).
             attention_mask: Attention mask of shape (batch_size, 1, seq_len, seq_len).
                 If no mask is provided, a mask by default which equals 1 over all non
                 pad tokens and 0 over pad tokens is computed.
+
         Returns:
             Dictionary containing the final embeddings and logits.
         """
@@ -1557,9 +1584,11 @@ def build_padding_attention_mask(
 ) -> torch.Tensor:
     """
     Builds a padding mask from a sequence of tokens by masking <pad> in the attention.
+
     Args:
         tokens: Batch of sequences of shape (batch_size, seq_len).
         pad_token_id: Int corresponding to the <pad> token to mask.
+
     Returns:
         Batch of attention masks, masking out <pad> tokens.
     """
@@ -1586,6 +1615,7 @@ class TorchBioBrainEncoder(nn.Module):
         Args:
             bio_token_ids (torch.Tensor):
                 Shape (batch_size, num_bio_tokens)
+
         Returns:
             torch.Tensor:
                 Shape (batch_size, num_bio_tokens, embed_dim)
@@ -1695,6 +1725,7 @@ class TorchMultiModalPerceiverResampler(nn.Module):
     ):
         """
         Initialize a Perceiver Resampler model.
+
         Args:
             config: Dataclass containing model hyperparameters.
             name: Name for module (custom will break weight loading).
@@ -1823,8 +1854,10 @@ class TorchMultiModalPerceiverResamplerProjection(nn.Module):
         Args:
             bio_token_ids (torch.Tensor):
                 Shape (batch_size, num_bio_tokens)
+
             bio_embeddings (torch.Tensor):
                 Shape (batch_size, num_bio_tokens, embed_dim)
+
             english_token_ids (torch.Tensor):
                 Shape (batch_size, num_english_tokens)
         """
@@ -1867,4 +1900,3 @@ def build_perceiver_padding_attention_mask(
     padding_mask = padding_mask[:, None, None, :]
     padding_mask = padding_mask.repeat(1, 1, resampled_length, 1)  # noqa
     return padding_mask
-
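Beyond the docstring formatting fixes (a blank line inserted before each Args:/Returns: block), the behavioral change in this file is in TorchMultiOmicsModel.forward: multi_omics_tokens_ids is now typed tuple[torch.Tensor, torch.Tensor] and bio_token_ids is cloned unconditionally, so passing None for the bio tokens is no longer supported. A minimal calling sketch, assuming model is an already-instantiated TorchMultiOmicsModel; the shapes and vocabulary sizes are placeholders, not values from this PR:

import torch

# Placeholder token IDs; in practice these come from the English and bio tokenizers.
english_tokens = torch.randint(0, 100, (1, 64))   # (batch_size, num_english_tokens)
bio_tokens = torch.randint(0, 10, (1, 1, 512))    # (batch_size, num_bio_sequences, num_bio_tokens)

outs = model(
    multi_omics_tokens_ids=(english_tokens, bio_tokens),  # bio tokens are now required, not Optional
    projection_english_tokens_ids=english_tokens,         # placeholder; normally produced by the projection's English tokenizer
)
logits = outs["logits"]                           # (batch_size, num_tokens, vocab_size)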
 
text_generation.py CHANGED
@@ -55,24 +55,19 @@ class TextGenerationPipeline(Pipeline):
             truncation=True,
             max_length=english_tokens_max_length,
         ).input_ids
-        if len(dna_sequences) == 0:
-            bio_tokens = None
-        else:
-            bio_tokens = self.bio_tokenizer(
-                dna_sequences,
-                return_tensors="pt",
-                padding="max_length",
-                max_length=bio_tokens_max_length,
-                truncation=True,
-            ).input_ids.unsqueeze(0)
+        bio_tokens = self.bio_tokenizer(
+            dna_sequences,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=bio_tokens_max_length,
+            truncation=True,
+        ).input_ids.unsqueeze(0)
 
         return {"english_tokens": english_tokens, "bio_tokens": bio_tokens}
 
     def _forward(self, model_inputs: dict, max_num_tokens_to_decode: int = 50) -> dict:
         english_tokens = model_inputs["english_tokens"].clone()
-        bio_tokens = model_inputs["bio_tokens"]
-        if bio_tokens is not None:
-            bio_tokens = bio_tokens.clone()
+        bio_tokens = model_inputs["bio_tokens"].clone()
         projected_bio_embeddings = None
 
         actual_num_steps = 0
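The pipeline is updated to match: preprocess no longer special-cases an empty dna_sequences list (the bio_tokens = None fallback is gone) and _forward clones bio_tokens unconditionally, so every call must include at least one DNA sequence. A usage sketch; the checkpoint id and input keys below are assumptions for illustration, not something defined in this PR:

from transformers import pipeline

# Assumed setup: the repository ships this file as a custom text-generation pipeline.
pipe = pipeline(model="InstaDeepAI/ChatNT", trust_remote_code=True)

# Input keys are illustrative; check the model card for the exact expected format.
output = pipe(
    {
        "english_sequence": "Is there a promoter in this DNA sequence? <DNA>",
        "dna_sequences": ["ATTCCGATTCCGATTCCG"],  # must be non-empty after this change
    }
)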