Spaces:

Vishal1122
/

OCR_processor

Sleeping

App Files Community

Vishal1122 committed on Jun 10

Commit

bff705e

verified ·

1 Parent(s): 6d2a09a

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -37

app.py CHANGED Viewed

@@ -8,7 +8,9 @@ from typing import Dict
 import gradio as gr
 import ollama
 from PIL import Image
 def save_temp_image(image: Image.Image) -> str:
     """
@@ -27,45 +29,87 @@ def save_temp_image(image: Image.Image) -> str:
         return tmp_file.name
 def id_extractor(image: Image.Image) -> Dict:
-    """
-    Extracts key details from the provided image using the ollama chat model.
-    Args:
-        image (Image.Image): The image from which to extract details.
-    Returns:
-        Dict: A dictionary containing the extracted details.
-        If the image is None or an error occurs, returns an empty dictionary.
-    """
-    try:
-        error_trace = None
-        if image is None:
-            # Return empty dictionary and make the output invisible
-            return {}, gr.update(visible=False)
-        # Save the image temporarily
-        image_path = save_temp_image(image)
-        # Send the image to the ollama chat model for processing
-        response = ollama.chat(
-            model='qwen2.5vl:7b',
-            messages=[{
-                'role': 'user',
-                'content': "Extract key details like 'name', 'date of birth', 'ID number', 'Issuer' from the image as JSON, excluding signatures.",
-                'images': [image_path]
-            }]
-        )
-        # Clean up the response content
-        resp = response.message.content.replace("```json", "").replace("```", "").strip()
-        return json.loads(resp)
-    except json.JSONDecodeError as e:
-        # Capture and print the error traceback
-        error_trace = traceback.format_exc()
-        print(error_trace)
-        return "Kindly upload an image with good clarity"
 # Define the Gradio interface for the ID extractor
 id_interface = gr.Interface(

 import gradio as gr
 import ollama
 from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 def save_temp_image(image: Image.Image) -> str:
     """
         return tmp_file.name
 # Hugging Face checkpoint used for ID-card field extraction.
 _QWEN_VL_CHECKPOINT = "Qwen/Qwen2.5-VL-7B-Instruct"
 # Upper bound on generated tokens; generation stops earlier at end-of-sequence.
 _ID_MAX_NEW_TOKENS = 128
 # Instruction sent to the model together with the image.
 _ID_PROMPT = "Extract key details like 'name', 'date of birth', 'ID number', 'Issuer' from the image as JSON, excluding signatures. Note if a ID has two names, pick the first one."
 # Filled on first use so the weights are loaded once per process rather than
 # once per request (reloading a 7B checkpoint on every call is prohibitively slow).
 _QWEN_VL_STATE = {}
 def _load_qwen_vl():
     """Return the cached ``(model, processor)`` pair, loading both on first use."""
     if not _QWEN_VL_STATE:
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             _QWEN_VL_CHECKPOINT, torch_dtype="auto", device_map="auto"
         )
         processor = AutoProcessor.from_pretrained(_QWEN_VL_CHECKPOINT)
         # Publish both entries in one step so a partially filled cache is never observed.
         _QWEN_VL_STATE.update(model=model, processor=processor)
     return _QWEN_VL_STATE["model"], _QWEN_VL_STATE["processor"]
 def _parse_model_json(raw_text):
     """Convert the model's reply into a dict, tolerating Markdown code fences."""
     import json
     cleaned = raw_text.replace("```json", "").replace("```", "").strip()
     try:
         parsed = json.loads(cleaned)
     except json.JSONDecodeError:
         parsed = None
     if isinstance(parsed, dict):
         return parsed
     # Keep the raw reply so nothing the model produced is silently lost.
     return {
         "error": "Kindly upload an image with good clarity",
         "raw_output": raw_text,
     }
 def id_extractor(image: Image.Image) -> Dict:
     """
     Extracts key details from the provided image using the Qwen2.5-VL model.
     Args:
         image (Image.Image): The image from which to extract details.
     Returns:
         Dict: A dictionary containing the extracted details.
         If the image is None, returns an empty dictionary. If the model's
         reply is not a JSON object, returns a dictionary with an ``error``
         message and the unparsed reply under ``raw_output``.
     """
     if image is None:
         # Nothing to process; answer before paying for a model load.
         return {}
     model, processor = _load_qwen_vl()
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": image},
                 {"type": "text", "text": _ID_PROMPT},
             ],
         }
     ]
     # Preparation for inference
     prompt_text = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[prompt_text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
     )
     # device_map="auto" may place the model on an accelerator; the inputs
     # must live on the same device or generate() fails with a device mismatch.
     inputs = inputs.to(model.device)
     # Inference: generation of the output
     generated_ids = model.generate(**inputs, max_new_tokens=_ID_MAX_NEW_TOKENS)
     # generate() echoes the prompt tokens; keep only the newly produced ones.
     new_token_ids = [
         out_ids[len(in_ids):]
         for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     decoded = processor.batch_decode(
         new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
     # A single prompt was submitted, so the batch holds exactly one reply.
     return _parse_model_json(decoded[0])
 # Define the Gradio interface for the ID extractor
 id_interface = gr.Interface(