OEvortex
/

HelpingAI-Vision

@@ -13,12 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Processor class for Llava.
 """
 from typing import List, Optional, Union
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import (
@@ -28,52 +32,73 @@ from transformers.tokenization_utils_base import (
     TruncationStrategy,
 )
 from transformers.utils import TensorType
-import torch
-from open_clip.transform import PreprocessCfg, image_transform_v2
-from modeling_llava import LlavaForConditionalGeneration
-from PIL import Image
-import math
-class OpenCLIPImageProcessor:
-    def __init__(self, config, crop_size=384, max_tokens=100):
-        cfg = PreprocessCfg(**config)
-        transform = image_transform_v2(cfg=cfg, is_train=False)
-        self.transform = transform
-        self.crop_size = crop_size
-        self.max_tokens = max_tokens
-    def __call__(self, image: Image.Image):
-        output = self.transform_func(image)
-        return {
-            "pixel_values": output,
         }
-    def transform_func(self, image: Image.Image):
         outputs = []
-        outputs.append(self.transform(image))
         width, height = image.size
         crop_size = self.crop_size
-        if width <= crop_size and height <= crop_size:
-            outputs = torch.stack(outputs, dim=0)
-            return outputs
         total_tokens = math.inf
-        while total_tokens > self.max_tokens:
-            total_tokens = math.floor(
-                (2 * width - crop_size)
-                / crop_size
-                * (2 * height - crop_size)
-                / crop_size
             )
-            if total_tokens > self.max_tokens:
                 crop_size += 10
-        stride = crop_size // 2
-        x_steps = int(round((2 * width - crop_size) / crop_size))
         if x_steps < 1:
             x_steps = 1
-        y_steps = int(round((2 * height - crop_size) / crop_size))
         if y_steps < 1:
             y_steps = 1
         x_coords = []
         y_coords = []
         for i in range(x_steps):
@@ -85,6 +110,7 @@ class OpenCLIPImageProcessor:
         if y_coords[-1][1] != height:
             y_coords[-1][1] = height
         image_parts = []
         for i in range(len(x_coords)):
             for j in range(len(y_coords)):
                 image_parts.append(
@@ -92,20 +118,38 @@ class OpenCLIPImageProcessor:
                         (x_coords[i][0], y_coords[j][0], x_coords[i][1], y_coords[j][1])
                     )
                 )
         for image_part in image_parts:
-            outputs.append(self.transform(image_part))
-        outputs = torch.stack(outputs, dim=0)
-        return outputs
-    @property
-    def model_input_names(self):
-        return ["pixel_values"]
-class LlavaProcessor:
-    def __init__(self, image_processor: OpenCLIPImageProcessor, tokenizer):
         self.image_processor = image_processor
         self.tokenizer = tokenizer
     def __call__(
         self,
@@ -113,20 +157,24 @@ class LlavaProcessor:
             TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
         ] = None,
         images: ImageInput = None,
-        model: LlavaForConditionalGeneration = None,
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length=None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> BatchFeature:
         if images is not None:
-            pixel_values = self.image_processor(images)[
-                "pixel_values"
             ]
-            pixel_values = pixel_values.to(model.device).to(model.dtype)
-            image_outputs = model.vision_model(pixel_values)
             image_features = model.multi_modal_projector(image_outputs)
-            image_features = image_features.unsqueeze(0)
         else:
             image_features = None
         text_inputs = self.tokenizer(
@@ -136,7 +184,8 @@ class LlavaProcessor:
             truncation=truncation,
             max_length=max_length,
         )
         return BatchFeature(data={**text_inputs, "image_features": image_features})
     def batch_decode(self, *args, **kwargs):
@@ -150,3 +199,4 @@ class LlavaProcessor:
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+Processor class for HelpingAI-V.
 """
+import math
 from typing import List, Optional, Union
+import torch
+from PIL import Image
+from transformers import ImageProcessingMixin, ProcessorMixin, SiglipImageProcessor, AutoTokenizer, AutoImageProcessor
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import (
     TruncationStrategy,
 )
 from transformers.utils import TensorType
+class MultiCropImageProcessor(ImageProcessingMixin):
+    def __init__(self, model_name, max_crops=0, **kwargs):
+        """Store the cropping settings and load the wrapped SigLIP processor.
+
+        Args:
+            model_name: Identifier or path handed to
+                ``SiglipImageProcessor.from_pretrained``.
+            max_crops: Default crop budget kept on the instance.
+            **kwargs: Accepted but not used by this constructor.
+        """
+        # Plain configuration values first, then the processor load.
+        self.max_crops = max_crops
+        self.crop_size = 384
+        self.stride_ratio = 2
+        self.processor = SiglipImageProcessor.from_pretrained(model_name)
+    def __call__(
+        self,
+        images: List[Image.Image],
+        max_crops: int = -1,
+    ):
+        """Run ``process_image`` on every image and collect the results.
+
+        Args:
+            images: Images to process; each one yields one entry in every
+                output list.
+            max_crops: Crop budget forwarded to ``process_image``. A negative
+                value means "fall back to ``self.max_crops``".
+
+        Returns:
+            A dict with two parallel lists, ``"pixel_values"`` and
+            ``"coords"``, holding one ``process_image`` result per input
+            image. Both lists are empty when ``images`` is empty.
+        """
+        if max_crops < 0:
+            max_crops = self.max_crops
+        res = {
+            "pixel_values": [],
+            "coords": [],
         }
+        for image in images:
+            outputs, output_coords = self.process_image(image, max_crops)
+            res["pixel_values"].append(outputs)
+            res["coords"].append(output_coords)
+        # Return only after the loop: returning inside it dropped every image
+        # past the first and yielded None for an empty list.
+        return res
+    def process_image(
+        self,
+        image: Image.Image,
+        max_crops: int
+    ):
         outputs = []
+        output_coords = []
+        outputs.append(self.processor(image, return_tensors="pt").pixel_values)
+        output_coords.append(torch.tensor([0.5, 0.5]))
         width, height = image.size
         crop_size = self.crop_size
+        stride = crop_size // self.stride_ratio
+        if (
+            max_crops == 0
+            or width <= (crop_size + stride)
+            and height <= (crop_size + stride)
+        ):
+            outputs = torch.cat(outputs, dim=0)
+            output_coords = torch.cat(output_coords, dim=0)
+            return outputs, output_coords
         total_tokens = math.inf
+        while total_tokens > max_crops:
+            total_tokens = (
+                math.floor((width - crop_size) / stride) + 1
+            ) * (
+                math.floor((height - crop_size) / stride) + 1
             )
+            if total_tokens > max_crops:
                 crop_size += 10
+                stride = crop_size // self.stride_ratio
+        stride = crop_size // self.stride_ratio
+        x_steps = int(math.floor((width - crop_size) / stride) + 1)
         if x_steps < 1:
             x_steps = 1
+        y_steps = int(math.floor((height - crop_size) / stride) + 1)
         if y_steps < 1:
             y_steps = 1
+        if x_steps == 1 and y_steps == 1:
+            outputs = torch.cat(outputs, dim=0)
+            output_coords = torch.cat(output_coords, dim=0)
+            return outputs, output_coords
         x_coords = []
         y_coords = []
         for i in range(x_steps):
         if y_coords[-1][1] != height:
             y_coords[-1][1] = height
         image_parts = []
+        part_coords = []
         for i in range(len(x_coords)):
             for j in range(len(y_coords)):
                 image_parts.append(
                         (x_coords[i][0], y_coords[j][0], x_coords[i][1], y_coords[j][1])
                     )
                 )
+                part_coords.append(
+                    torch.tensor(
+                        [
+                            (x_coords[i][0] + x_coords[i][1]) / 2 / width,
+                            (y_coords[j][0] + y_coords[j][1]) / 2 / height,
+                        ]
+                    )
+                )
         for image_part in image_parts:
+            outputs.append(self.processor(image_part, return_tensors="pt").pixel_values)
+        for part_coord in part_coords:
+            output_coords.append(part_coord)
+        outputs = torch.cat(outputs, dim=0)
+        output_coords = torch.stack(output_coords, dim=0)
+        return outputs, output_coords
+class LlavaProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = MultiCropImageProcessor
+    tokenizer_class = "SiglipTokenizer"
+    def __init__(self, image_processor: MultiCropImageProcessor, tokenizer):
+        """Keep references to the image processor and tokenizer being wrapped.
+
+        Args:
+            image_processor: Image processor stored as ``self.image_processor``.
+            tokenizer: Tokenizer stored as ``self.tokenizer``.
+        """
+        # Starts unset; nothing in this constructor assigns a search model.
+        self.search_model = None
         self.tokenizer = tokenizer
         self.image_processor = image_processor
+    @classmethod
+    def from_pretrained(cls, path, trust_remote_code=True, **kwargs):
+        """Build a processor from the tokenizer and image processor at ``path``.
+
+        Args:
+            path: Identifier or path passed to ``AutoTokenizer.from_pretrained``
+                and to the ``MultiCropImageProcessor`` constructor.
+            trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``
+                and passed as a keyword to ``MultiCropImageProcessor``.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            An instance of ``cls`` wrapping the loaded tokenizer and image
+            processor.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=trust_remote_code)
+        image_processor = MultiCropImageProcessor(path, trust_remote_code=trust_remote_code)
+        # Instantiate through ``cls`` so a subclass gets an instance of itself
+        # rather than the hard-coded base class.
+        return cls(image_processor, tokenizer)
     def __call__(
         self,
             TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
         ] = None,
         images: ImageInput = None,
+        model = None,
+        max_crops: int = 0,
+        num_tokens = None,
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length=None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> BatchFeature:
         if images is not None:
+            processor_outputs = self.image_processor(images, max_crops)
+            pixel_values = processor_outputs["pixel_values"]
+            pixel_values = [
+                value.to(model.device).to(model.dtype) for value in pixel_values
             ]
+            coords = processor_outputs["coords"]
+            coords = [value.to(model.device).to(model.dtype) for value in coords]
+            image_outputs = model.vision_model(pixel_values, coords, num_tokens)
             image_features = model.multi_modal_projector(image_outputs)
         else:
             image_features = None
         text_inputs = self.tokenizer(
             truncation=truncation,
             max_length=max_length,
         )
+        text_inputs['input_ids'] = text_inputs['input_ids'].to(model.device)
+        text_inputs['attention_mask'] = text_inputs['attention_mask'].to(model.device)
         return BatchFeature(data={**text_inputs, "image_features": image_features})
     def batch_decode(self, *args, **kwargs):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))