PEFT
Safetensors
parikshitmukh committed
Commit 84edf53 · verified · 1 Parent(s): e35dd3c

Update README.md

Files changed (1): README.md (+126, -22)
README.md CHANGED
@@ -112,32 +112,136 @@ We **recommend** human review in all production deployments.
  ---
  ## How to Get Started

  ### Inference Example

  ```python
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from peft import PeftModel
- from PIL import Image
  import torch
-
- # Load model + adapter + tokenizer
- base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
- adapter = PeftModel.from_pretrained(base_model, "presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora")
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-
- # Load your image (replace 'input.jpg' with your image file)
- image = Image.open("input.jpg").convert("RGB")
-
- # Prepare prompt
- prompt = "Extract in markdown <image>"
- inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
- # Add the image argument here!
- outputs = adapter.generate(**inputs, images=[image])
-
- # Decode the output
- decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
- print(decoded)
  ```

  ## Inference API and Smart Chunking to handle large token size
 
  ---
  ## How to Get Started

+ #### Install requirements
+ accelerate==1.7.0
+ av==14.4.0
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ einops==0.8.1
+ filelock==3.18.0
+ flash_attn==2.7.4.post1
+ fsspec==2025.5.1
+ hf-xet==1.1.2
+ huggingface-hub==0.32.2
+ idna==3.10
+ Jinja2==3.1.6
+ MarkupSafe==3.0.2
+ mpmath==1.3.0
+ networkx==3.4.2
+ ninja==1.11.1.4
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ optimum==1.25.3
+ packaging==25.0
+ peft==0.14.0
+ pillow==11.2.1
+ psutil==7.0.0
+ PyYAML==6.0.2
+ qwen-vl-utils==0.0.8
+ regex==2024.11.6
+ requests==2.32.3
+ safetensors==0.5.3
+ sympy==1.13.1
+ tokenizers==0.21.1
+ torch==2.6.0
+ torchvision==0.21.0
+ tqdm==4.67.1
+ transformers==4.49.0
+ triton==3.2.0
+ typing_extensions==4.13.2
+ urllib3==2.4.0
+
+
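The list above is a full set of pins targeting CUDA 12.4 wheels (the `nvidia-*-cu12` packages). One way to install it, sketched below with placeholder file and environment names (`requirements.txt`, `.venv`) that are not part of this repo, is to install `torch` before `flash_attn`, which typically needs torch available at build time:

```bash
# Illustrative setup; save the pinned list above as requirements.txt first.
python3 -m venv .venv && source .venv/bin/activate
pip install --upgrade pip
pip install torch==2.6.0 torchvision==0.21.0   # flash_attn expects torch to be installed already
pip install -r requirements.txt
```
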
  ### Inference Example

  ```python
+ import argparse
  import torch
+ from PIL import Image
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+ from qwen_vl_utils import process_vision_info
+
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # select which GPU to use (e.g. "0", "1", ...)
+
+ def load_model_with_qlora(model_repo):
+     print(f"Loading model from: {model_repo}")
+     # Passing the adapter repo directly works because transformers (with peft installed)
+     # detects the adapter config and loads the base model plus the LoRA weights.
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         model_repo,
+         attn_implementation="flash_attention_2",
+         device_map="auto",
+         torch_dtype=torch.float16,
+         use_cache=True
+     )
+     processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
+     print("Model and processor loaded.")
+     return model, processor
+
+ @torch.no_grad()
+ def inference(model, processor, image_path, question):
+     image = Image.open(image_path).convert("RGB")
+
+     # Build a chat-format input sample (the 'role' key is required by the chat template)
+     example = {
+         'role': 'user',
+         'content': [
+             {'type': 'image', 'image': image},
+             {'type': 'text', 'text': question}
+         ]
+     }
+
+     # Process image + prompt separately
+     image_input = process_vision_info([example])[0]
+     text_input = processor.apply_chat_template(
+         [example], tokenize=False, add_generation_prompt=True
+     )
+
+     inputs = processor(
+         text=text_input,
+         images=image_input,
+         return_tensors="pt",
+         padding=True
+     ).to(model.device)
+
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=1024,
+         do_sample=False,
+         pad_token_id=processor.tokenizer.pad_token_id,
+         eos_token_id=processor.tokenizer.eos_token_id,
+         use_cache=True,
+         num_beams=1
+     )
+
+     # Drop the prompt tokens so only the generated continuation is decoded
+     generated = outputs[0][inputs["input_ids"].shape[1]:]
+     decoded = processor.decode(generated, skip_special_tokens=True)
+     return decoded.strip()
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Qwen2.5-VL Inference Script")
+     parser.add_argument("--model_repo", type=str, required=True, help="Hugging Face repo, e.g., presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora")
+     parser.add_argument("--image_path", type=str, required=True, help="Path to input image (e.g., input.jpg)")
+     parser.add_argument("--question", type=str, default="Extract the content in markdown format.", help="Question/prompt")
+
+     args = parser.parse_args()
+
+     model, processor = load_model_with_qlora(args.model_repo)
+     result = inference(model, processor, args.image_path, args.question)
+
+     print("\n=== Model Output ===")
+     print(result)
+ ```
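
The loader above lets transformers resolve the LoRA adapter automatically. If you prefer to attach the adapter explicitly with `peft`, a minimal sketch along the same lines (same base model and adapter IDs as above; the `merge_and_unload()` step is optional and folds the adapter into the base weights) could look like this:

```python
# Sketch: explicit adapter loading with peft instead of the automatic path above.
import torch
from peft import PeftModel
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    base, "presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora"
)
model = model.merge_and_unload()  # optional: bake the LoRA weights into the base model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
```

The resulting `model` and `processor` can then be passed to the same `inference` helper defined in the script above.
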
+ ## Command
+ Save the inference example above as `arabic_ocr.py`, then run:
+ ```bash
+ python3 arabic_ocr.py --model_repo presightai/arabic-image-to-markdown-qwen2.5vl-7b-instruct-lora --image_path input.jpg
  ```

  ## Inference API and Smart Chunking to handle large token size