Update README.md

We **recommend** human review in all production deployments.

---

## How to Get Started

#### Install requirements

```text
accelerate==1.7.0
av==14.4.0
certifi==2025.4.26
charset-normalizer==3.4.2
einops==0.8.1
filelock==3.18.0
flash_attn==2.7.4.post1
fsspec==2025.5.1
hf-xet==1.1.2
huggingface-hub==0.32.2
idna==3.10
Jinja2==3.1.6
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.4.2
ninja==1.11.1.4
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
optimum==1.25.3
packaging==25.0
peft==0.14.0
pillow==11.2.1
psutil==7.0.0
PyYAML==6.0.2
qwen-vl-utils==0.0.8
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
sympy==1.13.1
tokenizers==0.21.1
torch==2.6.0
torchvision==0.21.0
tqdm==4.67.1
transformers==4.49.0
triton==3.2.0
typing_extensions==4.13.2
urllib3==2.4.0
```
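
To set up the environment, one option is to save the pinned list above to a file and install it with pip; a minimal sketch, assuming the file is named `requirements.txt` (the README does not name one):

```bash
# Assumes the pinned list above was saved as requirements.txt (our naming choice).
pip install -r requirements.txt
```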

### Inference Example

```python
import argparse
import os

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Pin inference to a single GPU; must be set before CUDA is initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "3"


def load_model_with_qlora(model_repo):
    print(f"Loading model from: {model_repo}")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_repo,
        attn_implementation="flash_attention_2",
        device_map="auto",
        torch_dtype=torch.float16,
        use_cache=True,
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
    print("Model and processor loaded.")
    return model, processor


@torch.no_grad()
def inference(model, processor, image_path, question):
    image = Image.open(image_path).convert("RGB")

    # Build a single-turn chat message holding the image and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Extract the vision inputs and render the text side of the chat template.
    image_inputs, _ = process_vision_info(messages)
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
    )

    # Drop the prompt tokens so only the newly generated text is decoded.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    decoded = processor.decode(generated, skip_special_tokens=True)
    return decoded.strip()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Qwen2.5-VL Inference Script")
    parser.add_argument(
        "--model_repo", type=str, required=True,
        help="Hugging Face repo, e.g., presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora",
    )
    parser.add_argument(
        "--image_path", type=str, required=True,
        help="Path to input image (e.g., input.jpg)",
    )
    parser.add_argument(
        "--question", type=str, default="Extract the content in markdown format.",
        help="Question/prompt",
    )

    args = parser.parse_args()

    model, processor = load_model_with_qlora(args.model_repo)
    result = inference(model, processor, args.image_path, args.question)

    print("\n=== Model Output ===")
    print(result)
```
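
Note that the example requests `attn_implementation="flash_attention_2"`, which requires the pinned `flash_attn` package and a GPU that supports it. If that combination is unavailable on your machine, recent `transformers` releases accept `attn_implementation="sdpa"` instead; swapping it in here is our suggestion, not part of the original script.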

## Command

```bash
python3 arabic_ocr.py --model_repo presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora --image_path input.jpg
```
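
The command relies on the default prompt (`Extract the content in markdown format.`); pass `--question` to override it. The script name `arabic_ocr.py` assumes the inference example above was saved under that name.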

## Inference API and Smart Chunking to handle large token sizes