Update app.py
app.py
CHANGED
@@ -119,6 +119,10 @@ class ImageAdapter(nn.Module):
 
 
 
+# Determine device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
 # Load CLIP
 print("Loading CLIP")
 clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
@@ -134,7 +138,9 @@ del checkpoint
 
 clip_model.eval()
 clip_model.requires_grad_(False)
-
+if device.type == 'cuda':
+    clip_model = clip_model.to(dtype=torch.bfloat16) # Convert to bfloat16 if on CUDA
+clip_model.to(device)
 
 
 # Tokenizer
@@ -145,21 +151,25 @@ assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTr
 # LLM
 print("Loading LLM")
 print("Loading VLM's custom text model")
-
+# Use device_map="auto" to allow accelerate to handle model placement, including CPU
+text_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH / "text_model", device_map="auto", torch_dtype=torch.bfloat16)
 text_model.eval()
 
 # Image Adapter
 print("Loading image adapter")
 image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size, False, False, 38, False)
-image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
+image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")) # Load to CPU first
 image_adapter.eval()
-
+if device.type == 'cuda':
+    image_adapter = image_adapter.to(dtype=torch.bfloat16) # Convert to bfloat16 if on CUDA
+image_adapter.to(device)
 
 
-@spaces.GPU()
+@spaces.GPU() # We keep this decorator for now, assuming GPU is preferred if available
 @torch.no_grad()
 def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str | int, extra_options: list[str], name_input: str, custom_prompt: str) -> tuple[str, str]:
-
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
 
     # 'any' means no length specified
     length = None if caption_length == "any" else caption_length
@@ -201,14 +211,32 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-
+    if device.type == 'cuda':
+        pixel_values = pixel_values.to(device, dtype=torch.bfloat16)
+    else:
+        pixel_values = pixel_values.to(device) # CPU will use float32
 
     # Embed image
     # This results in Batch x Image Tokens x Features
-
+    # For CPU, autocast can use bfloat16 if available and beneficial, or can be disabled.
+    # For simplicity here, we'll enable it for CPU with bfloat16 if PyTorch supports it, else float32.
+    # Note: True CPU mixed precision benefits depend on CPU architecture and PyTorch version.
+    autocast_device_type = device.type
+    autocast_kwargs = {'enabled': True}
+    if autocast_device_type == 'cpu':
+        # Check if bfloat16 is supported for CPU autocast, otherwise default to float32 by not specifying dtype
+        # This check might be more involved depending on PyTorch version; for now, let's assume it handles it.
+        # Or, explicitly set dtype if needed: autocast_kwargs['dtype'] = torch.bfloat16
+        pass
+
+
+    with torch.amp.autocast_mode.autocast(autocast_device_type, **autocast_kwargs):
         vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
         embedded_images = image_adapter(vision_outputs.hidden_states)
-        embedded_images
+        # embedded_images are already on the correct device due to image_adapter.to(device)
+        # and operations within adapter should respect input tensor's device.
+        # Explicitly moving again to be safe, though may be redundant.
+        embedded_images = embedded_images.to(device)
 
     # Build the conversation
     convo = [
@@ -228,34 +256,38 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
 
     # Tokenize the conversation
     # prompt_str is tokenized separately so we can do the calculations below
-    convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
-    prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
+    convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False).to(device)
+    prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False).to(device)
     assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor)
     convo_tokens = convo_tokens.squeeze(0) # Squeeze just to make the following easier
     prompt_tokens = prompt_tokens.squeeze(0)
 
     # Calculate where to inject the image
-
+    # Ensure convo_tokens is on the CPU for this kind of operation if it involves list conversion or complex indexing not ideal for GPU
+    eot_id_indices = (convo_tokens.cpu() == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
     assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}"
 
     preamble_len = eot_id_indices[1] - prompt_tokens.shape[0] # Number of tokens before the prompt
 
     # Embed the tokens
-    convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(
+    convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(text_model.device)) # Ensure tokens are on same device as text_model
 
     # Construct the input
+    # Ensure all parts are on the same device before concatenation
     input_embeds = torch.cat([
         convo_embeds[:, :preamble_len], # Part before the prompt
-        embedded_images.to(dtype=convo_embeds.dtype), # Image
+        embedded_images.to(dtype=convo_embeds.dtype, device=convo_embeds.device), # Image, ensure same dtype and device
         convo_embeds[:, preamble_len:], # The prompt and anything after it
-    ], dim=1)
+    ], dim=1)
+    # input_embeds will be on the device of convo_embeds (i.e. text_model.device)
 
     input_ids = torch.cat([
         convo_tokens[:preamble_len].unsqueeze(0),
-        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long), # Dummy tokens for the image
+        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long, device=convo_tokens.device), # Dummy tokens for the image
         convo_tokens[preamble_len:].unsqueeze(0),
-    ], dim=1)
-
+    ], dim=1)
+    # input_ids will be on the device of convo_tokens
+    attention_mask = torch.ones_like(input_ids) # Will be on the same device as input_ids
 
     # Debugging
     print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
@@ -277,6 +309,9 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
 with gr.Blocks() as demo:
     gr.HTML(TITLE)
 
+    if device.type == 'cpu':
+        gr.Markdown("**Warning: Running on CPU.** Captions may take a very long time to generate (potentially several minutes). For faster performance, please use a Space with GPU hardware.")
+
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
@@ -333,4 +368,4 @@ with gr.Blocks() as demo:
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
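Note on the CPU autocast branch: the added block leaves the bfloat16 decision as a `pass`, with a comment that the check "might be more involved depending on PyTorch version". Below is a minimal sketch of how that probe could be made explicit; the helper name `cpu_bf16_autocast_supported` and the try/except probe are illustrative assumptions, not part of this commit, and older PyTorch builds may fall back silently rather than raise.

```python
import torch

def cpu_bf16_autocast_supported() -> bool:
    """Probe whether this PyTorch build can run a bfloat16 autocast region on CPU."""
    try:
        with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
            torch.mm(torch.ones(2, 2), torch.ones(2, 2))  # tiny op inside the autocast region
        return True
    except Exception:
        return False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autocast_kwargs = {"enabled": True}
if device.type == "cpu" and cpu_bf16_autocast_supported():
    autocast_kwargs["dtype"] = torch.bfloat16  # the option the commit leaves commented out

# Same call shape as in app.py:
# with torch.amp.autocast_mode.autocast(device.type, **autocast_kwargs):
#     ...
```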
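Note on `.to(text_model.device)`: the commit embeds the conversation tokens on the text model's own device rather than the global `device`, because `device_map="auto"` hands placement to accelerate and the model may not sit where `device` points. A small inspection sketch under that assumption; `MODEL_ID` is a placeholder checkpoint, not the `CHECKPOINT_PATH / "text_model"` weights app.py loads, and `accelerate` must be installed for `device_map="auto"`.

```python
import torch
from transformers import AutoModelForCausalLM

MODEL_ID = "gpt2"  # placeholder model just for the demonstration

text_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype=torch.bfloat16)

# Where accelerate actually placed the weights: {'': 'cpu'} on a CPU-only box,
# or a per-module map when weights are spread across devices.
print(text_model.hf_device_map)

# Inputs to the embedding layer must live on the model's device, which is why the
# diff moves token ids with .to(text_model.device) instead of the global `device`.
print(text_model.device)
```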