oelachqar committed f19f7fc (verified) · 1 parent: bae94b8

Upload folder using huggingface_hub

modeling_molmo.py CHANGED
@@ -2013,6 +2013,7 @@ class MolmoForCausalLM(PreTrainedModel):
     config_class = MolmoConfig
     base_model_prefix = "model"
     _no_split_modules = ["MolmoBlock"]
+    _tp_plan = {}
 
     def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
         super().__init__(config)
@@ -2150,6 +2151,8 @@ class MolmoForCausalLM(PreTrainedModel):
         # Shift so that tokens < n predict n
         shift_logits = logits[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
+        # Ignore image tokens
+        shift_labels = torch.where(shift_labels >= 152064, torch.tensor(-100, device=shift_labels.device), shift_labels)
         # Flatten the tokens
         loss_fct = torch.nn.CrossEntropyLoss()
         shift_logits = shift_logits.view(-1, self.config.embedding_size)
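Context for the second hunk: `torch.nn.CrossEntropyLoss` skips targets equal to `-100` by default (its `ignore_index`), so remapping label ids at or above 152064 drops the image tokens from the training loss. A minimal runnable sketch; the vocabulary size of 152128 and the sample token ids are illustrative stand-ins, with 152064 taken from the diff as the start of the image-token range above the text vocabulary:

```python
import torch

VOCAB = 152128  # stand-in for config.embedding_size (an assumption)
logits = torch.randn(2, 5, VOCAB)
labels = torch.tensor([[1, 2, 152100, 3, 4],      # 152100: stand-in image token id
                       [5, 152070, 6, 7, 8]])

shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

# Remap image-token labels to -100, the default ignore_index of
# CrossEntropyLoss, so they contribute nothing to the loss.
shift_labels = torch.where(shift_labels >= 152064,
                           torch.tensor(-100, device=shift_labels.device),
                           shift_labels)

loss = torch.nn.CrossEntropyLoss()(shift_logits.view(-1, VOCAB),
                                    shift_labels.view(-1))
```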
preprocessing_molmo.py CHANGED
@@ -183,10 +183,16 @@ class MolmoProcessor(ProcessorMixin):
         image_input_idx = out["image_input_idx"]
         out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
 
+        # Add attention mask for training
+        out["attention_mask"] = np.ones_like(decoder_input_tokens)
+
         for k, v in out.items():
             out[k] = torch.from_numpy(v)
 
         return out
 
+    def __call__(self, *args, **kwargs):
+        return self.process(*args, **kwargs)
+
 
 MolmoProcessor.register_for_auto_class()
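With `__call__` delegating to `process`, the processor can now be invoked like a standard Hugging Face processor, and the new `attention_mask` rides along in the outputs. A usage sketch; the repo id and image path are placeholders, and `process` is assumed to accept `images`/`text` keywords as in the upstream Molmo processors:

```python
from PIL import Image
from transformers import AutoProcessor

# Placeholder repo id; substitute the actual checkpoint.
processor = AutoProcessor.from_pretrained("allenai/Molmo-7B-D-0924",
                                          trust_remote_code=True)

# Previously only processor.process(...) worked; __call__ now forwards to it.
inputs = processor(images=[Image.open("example.jpg")],
                   text="Describe this image.")
print(inputs["attention_mask"])  # all-ones mask added for training
```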
tokenizer_config.json CHANGED
@@ -229,7 +229,7 @@
     "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
   },
   "bos_token": "<|endoftext|>",
-  "chat_template": "{% for message in messages -%}\n {%- if (loop.index % 2 == 1 and message['role'] != 'user') or \n (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif -%}\n {{ message['role'].capitalize() + ': ' + message['content'] }}\n {%- if not loop.last -%}\n {{ ' ' }}\n {%- endif %}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{ ' Assistant:' }}\n {%- endif %}",
+  "chat_template": "{% for message in messages -%}\n {%- if (loop.index % 2 == 1 and message['role'].lower() != 'user') or \n (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif -%}\n {{ message['role'].capitalize() + ': ' + message['content'] }}\n {%- if not loop.last -%}\n {{ ' ' }}\n {%- endif %}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{ ' Assistant:' }}\n {%- endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "model_max_length": 8192,