royyy74 committed on
Commit 4bd4500 · verified · 1 Parent(s): 84d516b

Update app.py

Files changed (1)
  1. app.py +64 -67
app.py CHANGED
@@ -1,6 +1,3 @@
- import spaces
- import gradio as gr
- from huggingface_hub import InferenceClient
  from torch import nn
  from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
  from pathlib import Path
@@ -9,11 +6,26 @@ import torch.amp.autocast_mode
  from PIL import Image
  import os
  import torchvision.transforms.functional as TVF
+ import io
+ import json  # For parsing extra_options_json
+
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+ from pydantic import BaseModel
+ from typing import List, Tuple  # Tuple for stream_chat return type hint
+
+
+ # FastAPI App Initialization
+ app = FastAPI()
+
+ # Pydantic model for API response
+ class CaptionResponse(BaseModel):
+     prompt_that_was_used: str
+     caption: str


  CLIP_PATH = "google/siglip-so400m-patch14-384"
  CHECKPOINT_PATH = Path("cgrkzexw-599808")
- TITLE = "<h1><center>JoyCaption Alpha Two (2024-09-26a)</center></h1>"
+ # TITLE is not used for API
  CAPTION_TYPE_MAP = {
      "Descriptive": [
          "Write a descriptive caption for this image in a formal tone.",
@@ -62,7 +74,8 @@ CAPTION_TYPE_MAP = {
      ],
  }

- HF_TOKEN = os.environ.get("HF_TOKEN", None)
+ # HF_TOKEN is not used in the API version
+ # HF_TOKEN = os.environ.get("HF_TOKEN", None)


  class ImageAdapter(nn.Module):
@@ -165,8 +178,8 @@ if device.type == 'cuda':
  image_adapter.to(device)


- @spaces.GPU()  # We keep this decorator for now, assuming GPU is preferred if available
- @torch.no_grad()
+ # Keep @torch.no_grad(): caption generation never needs gradients, whether stream_chat is called directly or from the API endpoint
+ @torch.no_grad()
  def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str, extra_options: list[str], name_input: str, custom_prompt: str) -> tuple[str, str]:
      if device.type == "cuda":
          torch.cuda.empty_cache()
@@ -306,66 +319,50 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
      return prompt_str, caption.strip()


- with gr.Blocks() as demo:
-     gr.HTML(TITLE)
-
-     if device.type == 'cpu':
-         gr.Markdown("**Warning: Running on CPU.** Captions may take a very long time to generate (potentially several minutes). For faster performance, please use a Space with GPU hardware.")
-
-     with gr.Row():
-         with gr.Column():
-             input_image = gr.Image(type="pil", label="Input Image")
-
-             caption_type = gr.Dropdown(
-                 choices=["Descriptive", "Descriptive (Informal)", "Training Prompt", "MidJourney", "Booru tag list", "Booru-like tag list", "Art Critic", "Product Listing", "Social Media Post"],
-                 label="Caption Type",
-                 value="Descriptive",
-             )
-
-             caption_length = gr.Dropdown(
-                 choices=["any", "very short", "short", "medium-length", "long", "very long"] +
-                         [str(i) for i in range(20, 261, 10)],
-                 label="Caption Length",
-                 value="long",
-             )
-
-             extra_options = gr.CheckboxGroup(
-                 choices=[
-                     "If there is a person/character in the image you must refer to them as {name}.",
-                     "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
-                     "Include information about lighting.",
-                     "Include information about camera angle.",
-                     "Include information about whether there is a watermark or not.",
-                     "Include information about whether there are JPEG artifacts or not.",
-                     "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
-                     "Do NOT include anything sexual; keep it PG.",
-                     "Do NOT mention the image's resolution.",
-                     "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
-                     "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
-                     "Do NOT mention any text that is in the image.",
-                     "Specify the depth of field and whether the background is in focus or blurred.",
-                     "If applicable, mention the likely use of artificial or natural lighting sources.",
-                     "Do NOT use any ambiguous language.",
-                     "Include whether the image is sfw, suggestive, or nsfw.",
-                     "ONLY describe the most important elements of the image."
-                 ],
-                 label="Extra Options"
-             )
-
-             name_input = gr.Textbox(label="Person/Character Name (if applicable)")
-             gr.Markdown("**Note:** Name input is only used if an Extra Option is selected that requires it.")
-
-             custom_prompt = gr.Textbox(label="Custom Prompt (optional, will override all other settings)")
-             gr.Markdown("**Note:** Alpha Two is not a general instruction follower and will not follow prompts outside its training data well. Use this feature with caution.")
-
-             run_button = gr.Button("Caption")
-
-         with gr.Column():
-             output_prompt = gr.Textbox(label="Prompt that was used")
-             output_caption = gr.Textbox(label="Caption")
-
-     run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_length, extra_options, name_input, custom_prompt], outputs=[output_prompt, output_caption])
+ @app.post("/caption_image/", response_model=CaptionResponse)
+ async def caption_image_endpoint(
+     image_file: UploadFile = File(...),
+     caption_type: str = Form(...),
+     caption_length: str = Form(...),
+     extra_options_json: str = Form("[]"),  # Expect a JSON string for list of options
+     name_input: str = Form(""),
+     custom_prompt: str = Form("")
+ ):
+     try:
+         # Read image file
+         image_bytes = await image_file.read()
+         input_image = Image.open(io.BytesIO(image_bytes))
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=f"Invalid image file: {e}")
+
+     try:
+         # Parse extra_options from JSON string
+         extra_options = json.loads(extra_options_json)
+         if not isinstance(extra_options, list):
+             raise ValueError("extra_options_json must be a JSON list")
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=f"Invalid extra_options_json: {e}")
+
+     # Call the existing stream_chat function
+     # Ensure stream_chat is compatible with these inputs
+     try:
+         prompt_used, generated_caption = stream_chat(
+             input_image=input_image,
+             caption_type=caption_type,
+             caption_length=caption_length,
+             extra_options=extra_options,
+             name_input=name_input,
+             custom_prompt=custom_prompt
+         )
+         return CaptionResponse(prompt_that_was_used=prompt_used, caption=generated_caption)
+     except ValueError as e:  # Catch specific errors from stream_chat like invalid caption_length
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         # General error catch for unexpected issues during model processing
+         print(f"Error during caption generation: {e}")  # Log for server visibility
+         raise HTTPException(status_code=500, detail="Internal server error during caption generation.")


  if __name__ == "__main__":
-     demo.launch(share=True)
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
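For reference, a minimal client sketch for the new /caption_image/ endpoint, assuming the server is running locally on the port 8000 configured in the __main__ block. The form-field and response-field names come from caption_image_endpoint and CaptionResponse in the diff above; the requests library, the example image path, and the particular option strings chosen are illustrative assumptions, not part of the commit.

# Hypothetical client call to the /caption_image/ endpoint added in this commit.
# Assumes `pip install requests` and a local server started with `python app.py`.
import json
import requests

# Option strings mirror entries from the removed Gradio CheckboxGroup choices;
# the endpoint expects them as one JSON-encoded string in the extra_options_json field.
extra_options = [
    "Include information about lighting.",
    "Do NOT mention any text that is in the image.",
]

with open("example.jpg", "rb") as f:  # placeholder image path
    response = requests.post(
        "http://localhost:8000/caption_image/",
        files={"image_file": f},
        data={
            "caption_type": "Descriptive",
            "caption_length": "long",
            "extra_options_json": json.dumps(extra_options),
            "name_input": "",
            "custom_prompt": "",
        },
    )

response.raise_for_status()
result = response.json()  # {"prompt_that_was_used": "...", "caption": "..."}
print(result["caption"])

A malformed image or a non-list extra_options_json comes back as HTTP 400, and unexpected failures inside stream_chat as HTTP 500, matching the HTTPException branches in the endpoint.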