Update app.py
Browse files
app.py
CHANGED
@@ -196,14 +196,31 @@ if device.type == 'cuda':
|
|
196 |
print(f"Failed to load LLM on CUDA: {e}")
|
197 |
raise
|
198 |
else:
|
199 |
-
# CPU-only loading:
|
200 |
-
print("Attempting to load LLM on CPU with float32...")
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
text_model.eval()
|
209 |
|
@@ -408,4 +425,4 @@ async def caption_image_endpoint(
|
|
408 |
|
409 |
if __name__ == "__main__":
|
410 |
import uvicorn
|
411 |
-
uvicorn.run(app, host="0.0.0.0", port=
|
|
|
196 |
print(f"Failed to load LLM on CUDA: {e}")
|
197 |
raise
|
198 |
else:
|
199 |
+
# CPU-only loading: first try to hold the whole model in RAM as float32; if
# that attempt raises, retry with device_map="auto" plus an on-disk offload
# folder so that weights which do not fit in memory can be spilled to disk.
print("Attempting to load LLM on CPU with float32 (direct to RAM)...")
try:
    text_model = AutoModelForCausalLM.from_pretrained(
        CHECKPOINT_PATH / "text_model",
        device_map={'': 'cpu'},  # Explicitly map all to CPU
        torch_dtype=torch.float32,  # Use float32 for CPU
    )
    print("LLM loaded on CPU with float32 (direct to RAM).")
except Exception as e_cpu_direct:
    # Deliberately broad: an out-of-memory condition may surface as
    # RuntimeError, MemoryError or something else, and every failure of the
    # direct load gets one more chance via disk offloading.
    print(f"Direct CPU float32 loading failed: {e_cpu_direct}")
    print("Attempting to load LLM on CPU with disk offloading (float32)...")

    # Local imports: only this fallback path needs them.
    import atexit
    import shutil
    import tempfile

    try:
        # tempfile.mkdtemp() creates a directory that stays on disk until it
        # is removed explicitly. The previous `TemporaryDirectory().name`
        # dropped the TemporaryDirectory object right away, so the directory
        # was deleted when that object was finalized and the offload path no
        # longer existed by the time it was handed to from_pretrained().
        model_offload_dir_cpu = tempfile.mkdtemp(prefix="llm_offload_")
        # The offloaded weights have to outlive this block (the model is used
        # after loading), so clean the folder up at interpreter exit rather
        # than with a `with` statement.
        atexit.register(shutil.rmtree, model_offload_dir_cpu, ignore_errors=True)

        text_model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT_PATH / "text_model",
            device_map="auto",  # Allow accelerate to use CPU and disk
            torch_dtype=torch.float32,
            offload_folder=model_offload_dir_cpu,
            offload_state_dict=True,
        )
        print(f"LLM loaded on CPU with offloading to {model_offload_dir_cpu}. WARNING: This will be very slow.")
    except Exception as e_cpu_offload:
        print(f"CPU loading with disk offloading also failed: {e_cpu_offload}")
        raise  # Re-raise the exception if all CPU loading strategies fail
|
224 |
|
225 |
text_model.eval()
|
226 |
|
|
|
425 |
|
426 |
if __name__ == "__main__":
    # Script entry point: serve `app` with uvicorn when this file is executed
    # directly. uvicorn is imported here so it is only needed in that case.
    import uvicorn

    # Listen on every network interface, port 8000.
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)
|