shilinxu
/

Qwen2-VL-7B-ViT

Model card Files Files and versions

shilinxu committed on Jul 24

Commit

6843f07

·

verified ·

1 Parent(s): a7c0ad8

Update modeling_qwen2_vl.py

Files changed (1) hide show

modeling_qwen2_vl.py +3 -3

modeling_qwen2_vl.py CHANGED Viewed

@@ -395,13 +395,13 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
         return rotary_pos_emb
     @auto_docstring
-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
         """
-        hidden_states = self.patch_embed(hidden_states)
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
         position_embeddings = (emb.cos(), emb.sin())

         return rotary_pos_emb
     @auto_docstring
+    def forward(self, pixel_values: torch.Tensor, image_grid_thw: torch.Tensor) -> torch.Tensor:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
         """
+        hidden_states = self.patch_embed(pixel_values)
+        rotary_pos_emb = self.rot_pos_emb(image_grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
         position_embeddings = (emb.cos(), emb.sin())