Initial commit of MMNLI model with LFS

Browse files

Files changed (5) hide show

.gitattributes +1 -0
README.md +146 -0
config.json +15 -0
model.stateforce +3 -0
modeling_mmnli.py +98 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.stateforce filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,146 @@

+# Multilingual & Multimodal NLI (MMNLI)
+This repository provides the **MMNLI model**, a multilingual and multimodal Natural Language Inference classifier.
+It extends the BLASER architecture into **multiclass NLI**, supporting entailment, contradiction, and neutrality across text-text, text-speech, speech-text, and speech-speech input pairs.
+The model is trained on the [oist/multimodal_nli_dataset](https://huggingface.co/datasets/oist/multimodal_nli_dataset).
+Please refer to that dataset card for details.
+---
+## Usage
+The model depends on **SONAR embeddings**. You can use the official SONAR encoders (for text and speech) or the **ported SONAR text encoder** [`cointegrated/SONAR_200_text_encoder`](https://huggingface.co/cointegrated/SONAR_200_text_encoder).
+---
+### Example 1: Speech–Text Inference
+```python
+import torch
+from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline
+from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
+from transformers import AutoModel
+# 1. Load SONAR encoders
+speech_encoder = SpeechToEmbeddingModelPipeline(encoder="sonar_speech_encoder_eng")
+text_encoder = TextToEmbeddingModelPipeline(encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder")
+# 2. Encode premise (speech) and hypothesis (text)
+premise_embs = speech_encoder.predict(["audio.wav"])
+hypothesis_embs = text_encoder.predict(["The cat sat on the mat."], source_lang="eng_Latn")
+# 3. Load MMNLI model
+mmnli_model_name = "oist/multimodal_nli_model"
+mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True)
+mmnli_model.eval()
+# 4. Run inference
+with torch.inference_mode():
+    logits = mmnli_model(premise_embs, hypothesis_embs)  # class probabilities (softmax output), shape [batch_size, 3]
+    pred_class = torch.argmax(logits, dim=-1).item()
+print("Prediction:", pred_class)
+# 0 = Entailment, 1 = Neutral, 2 = Contradiction
+```
+### Example 2: Text–Text Inference (Official SONAR)
+```python
+import torch
+from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
+from transformers import AutoModel
+# 1. Load official SONAR text encoder
+text_encoder = TextToEmbeddingModelPipeline(
+    encoder="text_sonar_basic_encoder",
+    tokenizer="text_sonar_basic_encoder"
+)
+# 2. Encode premise and hypothesis
+premise_texts = ["Le chat s'assit sur le tapis."]
+hypothesis_texts = ["The cat sat on the mat."]
+premise_embs = text_encoder.predict(premise_texts, source_lang="fra_Latn")
+hypothesis_embs = text_encoder.predict(hypothesis_texts, source_lang="eng_Latn")
+# 3. Load MMNLI model
+mmnli_model = AutoModel.from_pretrained("oist/multimodal_nli_model", trust_remote_code=True)
+mmnli_model.eval()
+# 4. Run inference
+with torch.inference_mode():
+    logits = mmnli_model(premise_embs, hypothesis_embs)
+    pred_class = torch.argmax(logits, dim=-1).item()
+print("Prediction:", pred_class)
+# 0 = Entailment, 1 = Neutral, 2 = Contradiction
+```
+### Example 3: Text–Text Inference (Ported SONAR)
+```python
+import torch
+from transformers import AutoTokenizer, AutoModel
+from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder
+# 1. Load ported SONAR text encoder
+sonar_model_name = "cointegrated/SONAR_200_text_encoder"
+encoder = M2M100Encoder.from_pretrained(sonar_model_name)
+tokenizer = AutoTokenizer.from_pretrained(sonar_model_name)
+def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
+    # Encode `texts` with `encoder` and average its last hidden states over
+    # dim 1, using the tokenizer's attention mask as weights.
+    #   texts:     input passed straight to `tokenizer(...)`
+    #   lang:      value assigned to `tokenizer.src_lang` before tokenizing
+    #   norm:      when True, the pooled result goes through F.normalize
+    # Returns the pooled tensor `mean_emb`.
+    # Side effect: overwrites `src_lang` on the tokenizer that was passed in.
+    tokenizer.src_lang = lang
+    with torch.inference_mode():
+        batch = tokenizer(texts, return_tensors='pt', padding=True)
+        seq_embs = encoder(**batch).last_hidden_state
+        mask = batch.attention_mask
+        # Masked mean: zero the positions the mask excludes, sum over dim 1,
+        # then divide by the number of positions the mask keeps.
+        # (assumes last_hidden_state is (batch, seq, hidden) — TODO confirm)
+        mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1)
+        if norm:
+            # Off by default; MMNLIModel.forward applies its own F.normalize
+            # when the model config has norm_emb set.
+            mean_emb = torch.nn.functional.normalize(mean_emb)
+    return mean_emb
+# Example sentences
+premise_sentences = ["Le chat s'assit sur le tapis."]
+hypothesis_sentences = ["The cat sat on the mat."]
+# 2. Encode premise and hypothesis
+premise_embs = encode_mean_pool(premise_sentences, tokenizer, encoder, lang="fra_Latn")
+hypothesis_embs = encode_mean_pool(hypothesis_sentences, tokenizer, encoder, lang="eng_Latn")
+# 3. Load MMNLI model
+mmnli_model_name = "oist/multimodal_nli_model"
+mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True)
+mmnli_model.eval()
+# 4. Run inference
+with torch.inference_mode():
+    logits = mmnli_model(premise_embs, hypothesis_embs)  # class probabilities (softmax output), shape [batch_size, 3]
+    pred_class = torch.argmax(logits, dim=-1).item()
+print("Prediction:", pred_class)
+# 0 = Entailment, 1 = Neutral, 2 = Contradiction
+```
+---
+## Labels
+- 0 = Entailment
+- 1 = Neutral
+- 2 = Contradiction
+---
+## Citation
+If you use this model, please cite:
+```bibtex
+@inproceedings{istaiteh2025beyond,
+  title={Beyond Similarity Scoring: Detecting Entailment and Contradiction in Multilingual and Multimodal Contexts},
+  author={Istaiteh, Othman and Mdhaffar, Salima and Est{\`e}ve, Yannick},
+  booktitle={Proc. Interspeech 2025},
+  pages={286--290},
+  year={2025}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "activation": "TANH",
+    "architectures": ["MMNLIModel"],
+    "dropout": 0.1,
+    "embedding_dim": 1024,
+    "hidden_dims": [3072, 1536],
+    "model_type": "mmnli",
+    "norm_emb": true,
+    "output_dim": 3,
+    "transformers_version": "4.56.1",
+    "auto_map": {
+      "AutoConfig": "modeling_mmnli.MMNLIConfig",
+      "AutoModel": "modeling_mmnli.MMNLIModel"
+    }
+  }

model.stateforce ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1b0f69053bbbb0e4b1a4577014eda15b94030c506dbc212726cf2919128751d
+size 69245364

modeling_mmnli.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List, Optional
+from torch import Tensor
+from transformers import PretrainedConfig, PreTrainedModel
+# ---------------- CONFIG ---------------- #
+class MMNLIConfig(PretrainedConfig):
+    # Hyperparameters of the MMNLI classifier head. MMNLIModel reads
+    # embedding_dim, hidden_dims, dropout, activation and norm_emb from this
+    # object to build its MMNLICore.
+    #
+    # Same identifier as the "model_type" entry in config.json.
+    model_type = "mmnli"
+    def __init__(
+        self,
+        embedding_dim: int = 1024,  # width of one input embedding
+        hidden_dims: Optional[List[int]] = None,  # hidden Linear widths; None -> [3072, 1536]
+        dropout: float = 0.1,  # Dropout probability used by MMNLICore
+        activation: str = "TANH",  # key of ACTIVATIONS ("TANH" or "RELU")
+        norm_emb: bool = True,  # whether MMNLICore._norm applies F.normalize
+        **kwargs,  # forwarded unchanged to PretrainedConfig.__init__
+    ):
+        super().__init__(**kwargs)
+        self.embedding_dim = embedding_dim
+        # None is the default so that every instance gets its own fresh list
+        # instead of sharing one mutable default.
+        self.hidden_dims = hidden_dims if hidden_dims is not None else [3072, 1536]
+        self.dropout = dropout
+        self.activation = activation
+        self.norm_emb = norm_emb
+        # Not a constructor parameter: always assigned 3 here.
+        # NOTE(review): MMNLIModel does not pass this field to MMNLICore, which
+        # hard-codes its final Linear to 3 outputs.
+        self.output_dim = 3  # class ids per the README: 0 = entailment, 1 = neutral, 2 = contradiction
+# ---------------- CORE MODEL ---------------- #
+ACTIVATIONS = {"TANH": nn.Tanh, "RELU": nn.ReLU}  # activation name (as stored in the config) -> nn.Module class
+class MMNLICore(nn.Module):
+    # MLP classifier over a (premise, hypothesis) pair of embeddings.
+    #
+    # Holds the layer stack in `self.mlp` plus two helpers, `_norm` and
+    # `featurize`. No `forward` is defined on this class; MMNLIModel.forward
+    # calls the helpers and `self.mlp` directly.
+    def __init__(
+        self,
+        embedding_dim: int,  # width of one input embedding
+        hidden_dims: List[int],  # output width of each hidden Linear, in order
+        dropout: float,  # Dropout probability; no Dropout layers are added when <= 0
+        activation: str,  # key of ACTIVATIONS; any other value raises ValueError
+        norm_emb: bool,  # when True, _norm applies F.normalize to its input
+    ):
+        super().__init__()
+        self.norm_emb = norm_emb
+        if activation not in ACTIVATIONS:
+            raise ValueError(f"Unrecognized activation: {activation}")
+        # Input: concatenation of [p, h, p*h, |p-h|] => 4 * embedding_dim
+        input_dim = embedding_dim * 4
+        modules: List[nn.Module] = []
+        # Dropout is applied to the input features as well as after every
+        # hidden activation.
+        if dropout > 0:
+            modules.append(nn.Dropout(p=dropout))
+        nprev = input_dim
+        # One Linear -> activation -> (optional) Dropout group per hidden width;
+        # `nprev` carries the previous layer's output width forward.
+        for h in hidden_dims:
+            modules.append(nn.Linear(nprev, h))
+            modules.append(ACTIVATIONS[activation]())
+            if dropout > 0:
+                modules.append(nn.Dropout(p=dropout))
+            nprev = h
+        # Final classifier layer: 3-way softmax
+        # NOTE(review): the output width is the literal 3, not a value taken
+        # from the config. Because Softmax is part of `mlp`, the network emits
+        # probabilities rather than raw logits.
+        modules.append(nn.Linear(nprev, 3))
+        modules.append(nn.Softmax(dim=-1))
+        self.mlp = nn.Sequential(*modules)
+    # Returns `emb` unchanged when it is None or when norm_emb is False;
+    # otherwise returns F.normalize(emb).
+    # NOTE(review): F.normalize is called without `dim`. The PyTorch docs give
+    # dim=1 as the default, which is the last axis only for 2-D input, while
+    # featurize concatenates on dim=-1 — confirm callers always pass 2-D
+    # (batch, embedding_dim) tensors.
+    def _norm(self, emb: Optional[Tensor]) -> Optional[Tensor]:
+        return F.normalize(emb) if (emb is not None and self.norm_emb) else emb
+    # Builds the classifier input by concatenating, along the last dimension,
+    # the premise, the hypothesis, their element-wise product and the absolute
+    # value of their element-wise difference.
+    def featurize(self, premise: Tensor, hypothesis: Tensor) -> Tensor:
+        return torch.cat(
+            [premise, hypothesis, premise * hypothesis, torch.abs(premise - hypothesis)],
+            dim=-1,
+        )
+# ---------------- HF MODEL WRAPPER ---------------- #
+class MMNLIModel(PreTrainedModel):
+    # PreTrainedModel wrapper around MMNLICore; config.json's "auto_map" names
+    # this class as the AutoModel entry point.
+    config_class = MMNLIConfig
+    def __init__(self, config: MMNLIConfig):
+        super().__init__(config)
+        # NOTE(review): config.output_dim is not forwarded; MMNLICore fixes its
+        # output width at 3 internally.
+        self.core = MMNLICore(
+            embedding_dim=config.embedding_dim,
+            hidden_dims=config.hidden_dims,
+            dropout=config.dropout,
+            activation=config.activation,
+            norm_emb=config.norm_emb,
+        )
+    # Classifies a premise/hypothesis embedding pair.
+    #   premise, hypothesis: embedding tensors; each is passed through
+    #       core._norm (F.normalize when norm_emb is set) before featurizing.
+    #       (presumably (batch, embedding_dim) SONAR embeddings, as in the
+    #       README examples — confirm)
+    # Returns the output of core.mlp. That stack ends in Softmax(dim=-1), so the
+    # values are class probabilities, although the README examples bind them to
+    # a variable named `logits`.
+    def forward(self, premise: Tensor, hypothesis: Tensor):
+        premise = self.core._norm(premise)
+        hypothesis = self.core._norm(hypothesis)
+        proc = self.core.featurize(premise, hypothesis)
+        return self.core.mlp(proc)