RefurnishAI

Runtime error

App Files Files Community

RefurnishAI / merged_app2.py

Ashoka74

Update merged_app2.py

f8551d1 verified 11 months ago

raw

history blame

76 kB

	import os
	import random
	import sys
	from typing import Sequence, Mapping, Any, Union
	import torch
	import gradio as gr
	from PIL import Image, ImageDraw
	from huggingface_hub import hf_hub_download
	import spaces

	import argparse
	import random

	import os
	import math
	import gradio as gr
	import numpy as np
	import torch
	import safetensors.torch as sf
	import datetime
	from pathlib import Path
	from io import BytesIO

	from PIL import Image
	from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
	from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler
	from diffusers.models.attention_processor import AttnProcessor2_0
	from transformers import CLIPTextModel, CLIPTokenizer
	import dds_cloudapi_sdk
	from dds_cloudapi_sdk import Config, Client, TextPrompt
	from dds_cloudapi_sdk.tasks.dinox import DinoxTask
	from dds_cloudapi_sdk.tasks import DetectionTarget
	from dds_cloudapi_sdk.tasks.detection import DetectionTask
	from transformers import AutoModelForImageSegmentation


	from enum import Enum
	from torch.hub import download_url_to_file
	import tempfile

	from sam2.build_sam import build_sam2

	from sam2.sam2_image_predictor import SAM2ImagePredictor
	import cv2

	from transformers import AutoModelForImageSegmentation
	from inference_i2mv_sdxl import prepare_pipeline, remove_bg, run_pipeline
	from torchvision import transforms


	from typing import Optional

	from depth_anything_v2.dpt import DepthAnythingV2

	import httpx


	import gradio as gr
	import torch
	from diffusers import FluxFillPipeline
	from diffusers.utils import load_image
	from PIL import Image, ImageDraw
	import numpy as np
	import spaces
	from huggingface_hub import hf_hub_download
	import openai
	from openai import OpenAI
	import gradio as gr
	import os
	from PIL import Image
	import numpy as np
	import io
	import base64


	# Image size/format constants; neither name is referenced in the visible part of this file.
	MAX_IMAGE_WIDTH = 2048
	IMAGE_FORMAT = "JPEG"



	# Module-level HTTP client with a 10 second timeout.
	client = httpx.Client(timeout=httpx.Timeout(10.0)) # Set timeout to 10 seconds
	# Multi-view generation settings forwarded to run_pipeline() by infer().
	NUM_VIEWS = 6
	HEIGHT = 768
	WIDTH = 768
	# Upper bound used by infer() when it draws a random seed (largest int32).
	MAX_SEED = np.iinfo(np.int32).max



	import supervision as sv
	import torch
	from PIL import Image

	import logging

	# Configure logging: INFO level, timestamp followed by the message.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

	# Preprocessing applied by run_rmbg() before the image is handed to the
	# `rmbg` model: resize to 1024x1024, convert to a tensor, then normalize
	# each of the three channels with the mean/std values below.
	transform_image = transforms.Compose(
	[
	transforms.Resize((1024, 1024)),
	transforms.ToTensor(),
	transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	]
	)

	#hf_hub_download(repo_id="YarvixPA/FLUX.1-Fill-dev-gguf", filename="flux1-fill-dev-Q5_K_S.gguf", local_dir="models/")



	# Load

	# Model paths (local destinations of the files fetched by download_models())
	model_path = './models/iclight_sd15_fc.safetensors'
	model_path2 = './checkpoints/depth_anything_v2_vits.pth'
	model_path3 = './checkpoints/sam2_hiera_large.pt'
	model_path4 = './checkpoints/config.json'
	model_path5 = './checkpoints/preprocessor_config.json'
	model_path6 = './configs/sam2_hiera_l.yaml'
	model_path7 = './mvadapter_i2mv_sdxl.safetensors'

	# Base URL for the repository; download_models() appends a filename to it.
	BASE_URL = 'https://huggingface.co/Ashoka74/Placement/resolve/main/'

	# Model URLs: maps each local path to the remote filename under BASE_URL.
	model_urls = {
	model_path: 'iclight_sd15_fc.safetensors',
	model_path2: 'depth_anything_v2_vits.pth',
	model_path3: 'sam2_hiera_large.pt',
	model_path4: 'config.json',
	model_path5: 'preprocessor_config.json',
	model_path6: 'sam2_hiera_l.yaml',
	model_path7: 'mvadapter_i2mv_sdxl.safetensors'
	}

	# Ensure directories exist
	def ensure_directories():
	    """Create the parent directory of every local path listed in ``model_urls``.

	    Existing directories are left untouched. A path without a directory
	    component is skipped, because ``os.path.dirname`` returns ``''`` for it
	    and ``os.makedirs('')`` raises ``FileNotFoundError``.
	    """
	    for path in model_urls:
	        parent = os.path.dirname(path)
	        if parent:
	            os.makedirs(parent, exist_ok=True)

	# Download models
	def download_models():
	    """Download every ``model_urls`` entry whose local file does not exist yet.

	    Each remote file is addressed as ``BASE_URL`` + filename. A failed
	    download is reported on stdout and does not stop the remaining ones.
	    """
	    for destination, remote_name in model_urls.items():
	        if os.path.exists(destination):
	            continue
	        try:
	            print(f"Downloading {remote_name}")
	            download_url_to_file(f"{BASE_URL}{remote_name}", destination)
	            print(f"Successfully downloaded {remote_name}")
	        except Exception as e:
	            print(f"Error downloading {remote_name}: {e}")

	# Executed at import time: create the target directories, then fetch any
	# model file from model_urls that is not already on disk.
	ensure_directories()
	download_models()


	# Fetch additional model files from the Hugging Face Hub into ./models/...
	# Only the last call keeps its return value (t5_path); the other returned
	# paths are discarded.
	hf_hub_download(repo_id="black-forest-labs/FLUX.1-Redux-dev", filename="flux1-redux-dev.safetensors", local_dir="models/style_models")
	hf_hub_download(repo_id="black-forest-labs/FLUX.1-Depth-dev", filename="flux1-depth-dev.safetensors", local_dir="models/diffusion_models")
	hf_hub_download(repo_id="Comfy-Org/sigclip_vision_384", filename="sigclip_vision_patch14_384.safetensors", local_dir="models/clip_vision")
	hf_hub_download(repo_id="Kijai/DepthAnythingV2-safetensors", filename="depth_anything_v2_vitl_fp32.safetensors", local_dir="models/depthanything")
	hf_hub_download(repo_id="black-forest-labs/FLUX.1-dev", filename="ae.safetensors", local_dir="models/vae/FLUX1")
	hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", filename="clip_l.safetensors", local_dir="models/text_encoders")
	t5_path = hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", filename="t5xxl_fp16.safetensors", local_dir="models/text_encoders/t5")


	# Load the tokenizer, text encoder, VAE and UNet individually from the
	# corresponding subfolders of the sd15_name repository. They are later
	# assembled into t2i_pipe / i2i_pipe.
	sd15_name = 'stablediffusionapi/realistic-vision-v51'
	tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
	text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
	vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
	unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")


	from diffusers import FluxTransformer2DModel, FluxFillPipeline, GGUFQuantizationConfig
	from transformers import T5EncoderModel
	import torch



	# Location of the GGUF-quantized FLUX fill transformer checkpoint.
	# NOTE(review): the URL has no /blob/<rev>/ or /resolve/<rev>/ segment —
	# confirm that from_single_file() can resolve it.
	ckpt_path = (
	"https://huggingface.co/SporkySporkness/FLUX.1-Fill-dev-GGUF/flux1-fill-dev-fp16-Q5_0-GGUF.gguf"
	)

	# Load the transformer from the single GGUF file, computing in bfloat16.
	transformer = FluxTransformer2DModel.from_single_file(
	ckpt_path,
	quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
	torch_dtype=torch.bfloat16,
	)

	# Build the fill pipeline used by inpaint(), substituting the transformer
	# loaded above for the repository's own.
	# NOTE(review): `generator` is passed to from_pretrained() here rather than
	# to the pipeline call — confirm it has the intended effect.
	fill_pipe = FluxFillPipeline.from_pretrained(
	"black-forest-labs/FLUX.1-Fill-dev",
	transformer=transformer,
	generator=torch.manual_seed(0),
	torch_dtype=torch.bfloat16,
	)


	# Probe for the optional xformers package and record the result in
	# XFORMERS_AVAILABLE, which enable_efficient_attention() reads later.
	try:
	import xformers
	import xformers.ops
	XFORMERS_AVAILABLE = True
	print("xformers is available - Using memory efficient attention")
	except ImportError:
	XFORMERS_AVAILABLE = False
	print("xformers not available - Using default attention")

	# fill_pipe.enable_model_cpu_offload()
	# fill_pipe.enable_vae_slicing()
	#fill_pipe.enable_xformers_memory_efficient_attention()


	# Memory optimizations for RTX 2070
	# Select the compute device: CUDA when available (with TF32 enabled for
	# matmul and cuDNN), otherwise CPU.
	torch.backends.cudnn.benchmark = True
	if torch.cuda.is_available():
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.allow_tf32 = True
	# Set a smaller attention slice size for RTX 2070
	# NOTE(review): confirm that assigning this attribute has any effect; the
	# allocator split size is normally set through PYTORCH_CUDA_ALLOC_CONF.
	torch.backends.cuda.max_split_size_mb = 512
	device = torch.device('cuda')
	else:
	device = torch.device('cpu')


	# Segmentation model used by run_rmbg(); loaded with trust_remote_code and
	# kept in float32 on the selected device.
	rmbg = AutoModelForImageSegmentation.from_pretrained(
	"ZhengPeng7/BiRefNet", trust_remote_code=True
	)
	rmbg = rmbg.to(device=device, dtype=torch.float32)


	# Depth model used by get_depth(); weights come from the checkpoint file
	# that download_models() places under ./checkpoints. Put in eval mode.
	model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
	model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vits.pth', map_location=device))
	model = model.to(device)
	model.eval()


	# Replace the UNet's input convolution with one that accepts 8 input
	# channels. The new weight is zero-initialised and its first 4 input
	# channels are copied from the original layer; the original bias is reused.
	# The extra channels receive the tensor concatenated in hooked_unet_forward().
	with torch.no_grad():
	new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
	new_conv_in.weight.zero_()
	new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
	new_conv_in.bias = unet.conv_in.bias
	unet.conv_in = new_conv_in


	# Keep a reference to the unpatched forward; hooked_unet_forward() delegates to it.
	unet_original_forward = unet.forward


	def can_expand(source_width, source_height, target_width, target_height, alignment):
	    """Return whether the source can be extended towards ``alignment``.

	    A "Left"/"Right" alignment is rejected when the source is already at
	    least as wide as the target; a "Top"/"Bottom" alignment is rejected when
	    it is already at least as tall. Any other alignment is always accepted.
	    """
	    blocked_horizontally = alignment in ("Left", "Right") and source_width >= target_width
	    blocked_vertically = alignment in ("Top", "Bottom") and source_height >= target_height
	    return not (blocked_horizontally or blocked_vertically)

	def prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
	    """Place ``image`` on a white canvas and build the matching fill mask.

	    Args:
	        image: PIL image to position on the canvas.
	        width, height: size of the output canvas in pixels.
	        overlap_percentage: share of the resized image (per axis) that the
	            mask is allowed to eat into on sides where overlap is enabled.
	        resize_option: one of "Full", "75%", "50%", "33%", "25%"; any other
	            value selects ``custom_resize_percentage``.
	        custom_resize_percentage: percentage used for the custom option.
	        alignment: "Middle", "Left", "Right", "Top" or "Bottom". Any other
	            value is treated as "Middle" (previously it left the margins
	            unassigned and raised ``UnboundLocalError``).
	        overlap_left, overlap_right, overlap_top, overlap_bottom: whether the
	            mask overlaps the image on that side.

	    Returns:
	        ``(background, mask)``: an RGB canvas with the resized image pasted
	        in, and an "L" mask that is 255 everywhere except for a 0-filled
	        rectangle over the part of the image that is kept.
	    """
	    target_size = (width, height)

	    # Fit the image inside the canvas while preserving its aspect ratio.
	    scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
	    fitted_size = (int(image.width * scale_factor), int(image.height * scale_factor))
	    source = image.resize(fitted_size, Image.LANCZOS)

	    # Named presets; everything else falls through to the custom percentage.
	    preset_percentages = {"Full": 100, "75%": 75, "50%": 50, "33%": 33, "25%": 25}
	    resize_percentage = preset_percentages.get(resize_option, custom_resize_percentage)

	    # Shrink by the chosen percentage, but never below 64 pixels per side.
	    resize_factor = resize_percentage / 100
	    new_width = max(int(source.width * resize_factor), 64)
	    new_height = max(int(source.height * resize_factor), 64)
	    source = source.resize((new_width, new_height), Image.LANCZOS)

	    # Overlap in pixels, at least one pixel per axis.
	    overlap_x = max(int(new_width * (overlap_percentage / 100)), 1)
	    overlap_y = max(int(new_height * (overlap_percentage / 100)), 1)

	    # Margins: the aligned axis is pinned to an edge, the other is centred.
	    centered_x = (target_size[0] - new_width) // 2
	    centered_y = (target_size[1] - new_height) // 2
	    if alignment == "Left":
	        margin_x, margin_y = 0, centered_y
	    elif alignment == "Right":
	        margin_x, margin_y = target_size[0] - new_width, centered_y
	    elif alignment == "Top":
	        margin_x, margin_y = centered_x, 0
	    elif alignment == "Bottom":
	        margin_x, margin_y = centered_x, target_size[1] - new_height
	    else:
	        # "Middle", and the safe default for an unrecognised alignment.
	        margin_x, margin_y = centered_x, centered_y

	    # Clamp so the pasted image never starts outside the canvas.
	    margin_x = max(0, min(margin_x, target_size[0] - new_width))
	    margin_y = max(0, min(margin_y, target_size[1] - new_height))

	    background = Image.new('RGB', target_size, (255, 255, 255))
	    background.paste(source, (margin_x, margin_y))

	    mask = Image.new('L', target_size, 255)
	    mask_draw = ImageDraw.Draw(mask)

	    # Without overlap the kept rectangle is still inset by a couple of
	    # pixels so no white seam of the canvas survives along the border.
	    white_gaps_patch = 2

	    left_overlap = margin_x + overlap_x if overlap_left else margin_x + white_gaps_patch
	    right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width - white_gaps_patch
	    top_overlap = margin_y + overlap_y if overlap_top else margin_y + white_gaps_patch
	    bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height - white_gaps_patch

	    # On the side the image is pinned to, no inset is applied when overlap
	    # is disabled for that side.
	    if alignment == "Left":
	        left_overlap = margin_x + overlap_x if overlap_left else margin_x
	    elif alignment == "Right":
	        right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width
	    elif alignment == "Top":
	        top_overlap = margin_y + overlap_y if overlap_top else margin_y
	    elif alignment == "Bottom":
	        bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height

	    # Mark the kept region with 0; everything else stays 255.
	    mask_draw.rectangle([
	        (left_overlap, top_overlap),
	        (right_overlap, bottom_overlap)
	    ], fill=0)

	    return background, mask

	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def inpaint(image, width, height, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom, progress=gr.Progress(track_tqdm=True)):
	    """Extend ``image`` to ``width`` x ``height`` using ``fill_pipe``.

	    The image is positioned on a canvas by ``prepare_image_and_mask``; the
	    masked area is blacked out, generated by the fill pipeline from
	    ``prompt_input``, and pasted back through the same mask.

	    Returns:
	        The canvas with the generated content pasted into the masked area.
	    """
	    canvas, fill_mask = prepare_image_and_mask(
	        image, width, height, overlap_percentage, resize_option,
	        custom_resize_percentage, alignment, overlap_left, overlap_right,
	        overlap_top, overlap_bottom,
	    )

	    # NOTE: the reassigned alignment is not read again in this function.
	    if not can_expand(canvas.width, canvas.height, width, height, alignment):
	        alignment = "Middle"

	    # Black out everything the mask selects for generation.
	    conditioning = canvas.copy()
	    conditioning.paste(0, (0, 0), fill_mask)

	    pipeline_output = fill_pipe(
	        prompt=prompt_input,
	        height=height,
	        width=width,
	        image=conditioning,
	        mask_image=fill_mask,
	        num_inference_steps=num_inference_steps,
	        guidance_scale=30,
	    )
	    generated = pipeline_output.images[0].convert("RGBA")

	    # Only the masked pixels are taken from the generated image.
	    conditioning.paste(generated, (0, 0), fill_mask)
	    return conditioning

	def preview_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
	    """Return an RGBA preview of the canvas with the fill area tinted red.

	    The canvas and mask come from ``prepare_image_and_mask``; a translucent
	    red layer is composited over the pixels selected by the mask.
	    """
	    canvas, fill_mask = prepare_image_and_mask(
	        image, width, height, overlap_percentage, resize_option,
	        custom_resize_percentage, alignment, overlap_left, overlap_right,
	        overlap_top, overlap_bottom,
	    )

	    tint = Image.new('RGBA', canvas.size, (255, 0, 0, 64))
	    tinted_region = Image.new('RGBA', canvas.size, (0, 0, 0, 0))
	    tinted_region.paste(tint, (0, 0), fill_mask)

	    return Image.alpha_composite(canvas.copy().convert('RGBA'), tinted_region)

	def clear_result():
	    """Return a Gradio update that resets a component's value to ``None``."""
	    cleared = gr.update(value=None)
	    return cleared

	def preload_presets(target_ratio, ui_width, ui_height):
	    """Translate an aspect-ratio choice into ``(width, height, update)``.

	    The fixed ratios map to fixed sizes with an empty Gradio update.
	    "Custom" keeps the current UI size and returns ``gr.update(open=True)``.
	    Any other value yields ``None``.
	    """
	    fixed_sizes = {
	        "9:16": (720, 1280),
	        "16:9": (1280, 720),
	        "1:1": (1024, 1024),
	    }
	    if target_ratio in fixed_sizes:
	        preset_width, preset_height = fixed_sizes[target_ratio]
	        return preset_width, preset_height, gr.update()
	    if target_ratio == "Custom":
	        return ui_width, ui_height, gr.update(open=True)
	    return None

	def select_the_right_preset(user_width, user_height):
	    """Return the ratio label matching the given size, or "Custom"."""
	    known_sizes = (
	        (720, 1280, "9:16"),
	        (1280, 720, "16:9"),
	        (1024, 1024, "1:1"),
	    )
	    for preset_width, preset_height, label in known_sizes:
	        if user_width == preset_width and user_height == preset_height:
	            return label
	    return "Custom"

	def toggle_custom_resize_slider(resize_option):
	    """Return a Gradio update that is visible only for the "Custom" option."""
	    is_custom = resize_option == "Custom"
	    return gr.update(visible=is_custom)

	def update_history(new_image, history):
	    """Put ``new_image`` at the front of ``history`` and return the list.

	    A ``None`` history starts a new list; an existing list is modified in
	    place and returned.
	    """
	    gallery = [] if history is None else history
	    gallery.insert(0, new_image)
	    return gallery


	def enable_efficient_attention():
	    """Pick the attention implementation for the module-level ``unet`` and ``vae``.

	    With xformers available, memory-efficient attention is requested on both
	    models; if that raises, the error is printed and both models get
	    ``AttnProcessor2_0``. Without xformers, ``AttnProcessor2_0`` is set
	    directly.
	    """
	    def _use_default_processors():
	        # Shared fallback for both "no xformers" and "xformers failed".
	        unet.set_attn_processor(AttnProcessor2_0())
	        vae.set_attn_processor(AttnProcessor2_0())

	    if not XFORMERS_AVAILABLE:
	        print("Using sliced attention")
	        _use_default_processors()
	        return

	    try:
	        unet.set_use_memory_efficient_attention_xformers(True)
	        vae.set_use_memory_efficient_attention_xformers(True)
	        print("Enabled xformers memory efficient attention")
	    except Exception as e:
	        print(f"Xformers error: {e}")
	        print("Falling back to sliced attention")
	        _use_default_processors()

	# Add memory clearing function
	def clear_memory():
	    """Empty PyTorch's CUDA cache; does nothing when CUDA is unavailable."""
	    if not torch.cuda.is_available():
	        return
	    torch.cuda.empty_cache()

	# Enable efficient attention
	# Runs once at import time; relies on `unet`, `vae` and XFORMERS_AVAILABLE
	# having been defined earlier in the module.
	enable_efficient_attention()


	def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
	    """Forward replacement that appends conditioning channels to ``sample``.

	    The tensor stored under ``cross_attention_kwargs['concat_conds']`` is
	    converted to match ``sample``, repeated along the batch axis to match
	    ``sample``'s batch size and concatenated on the channel axis. The call
	    is then delegated to the original UNet forward with
	    ``cross_attention_kwargs`` emptied.
	    """
	    extra_channels = kwargs['cross_attention_kwargs']['concat_conds'].to(sample)
	    repeats = sample.shape[0] // extra_channels.shape[0]
	    extra_channels = torch.cat([extra_channels] * repeats, dim=0)
	    widened_sample = torch.cat([sample, extra_channels], dim=1)
	    # The conditioning entry must not reach the attention layers.
	    kwargs['cross_attention_kwargs'] = {}
	    return unet_original_forward(widened_sample, timestep, encoder_hidden_states, **kwargs)


	# Route every call of the UNet through the channel-concatenating hook.
	unet.forward = hooked_unet_forward


	# Add the tensors stored in the model_path safetensors file to the UNet's
	# current weights, key by key, and load the sums back (strict: every key
	# of the UNet state dict must be present in the file).
	sd_offset = sf.load_file(model_path)
	sd_origin = unet.state_dict()
	# NOTE: `keys` is not used by the comprehension below; it is only deleted.
	keys = sd_origin.keys()
	sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
	unet.load_state_dict(sd_merged, strict=True)
	# Drop the temporary state dicts.
	del sd_offset, sd_origin, sd_merged, keys


	# Device and dtype setup
	# NOTE(review): this unconditionally selects CUDA and overrides the
	# CUDA/CPU choice made earlier in the module — confirm that is intended.
	device = torch.device('cuda')
	#dtype = torch.float16 # RTX 2070 works well with float16
	dtype = torch.bfloat16


	# Multi-view pipeline used by infer(); built by the project helper
	# prepare_pipeline() with NUM_VIEWS views on the device/dtype chosen above.
	pipe = prepare_pipeline(
	base_model="stabilityai/stable-diffusion-xl-base-1.0",
	vae_model="madebyollin/sdxl-vae-fp16-fix",
	unet_model=None,
	lora_model=None,
	adapter_path="huanngzh/mv-adapter",
	scheduler=None,
	num_views=NUM_VIEWS,
	device=device,
	dtype=dtype,
	)

	# pipe.enable_model_cpu_offload()
	# pipe.enable_vae_slicing()
	#pipe.enable_xformers_memory_efficient_attention()

	# Move models to device with consistent dtype
	text_encoder = text_encoder.to(device=device, dtype=dtype)
	vae = vae.to(device=device, dtype=dtype) # uses `dtype` above, i.e. torch.bfloat16
	unet = unet.to(device=device, dtype=dtype)
	#rmbg = rmbg.to(device=device, dtype=torch.float32) # Keep this as float32
	# Only the device is changed here; rmbg's dtype is left as it was.
	rmbg = rmbg.to(device)

	# Three scheduler instances sharing the same timestep count and beta range.
	# Only dpmpp_2m_sde_karras_scheduler is passed to the pipelines below; the
	# other two are not referenced in the visible part of this file.
	ddim_scheduler = DDIMScheduler(
	num_train_timesteps=1000,
	beta_start=0.00085,
	beta_end=0.012,
	beta_schedule="scaled_linear",
	clip_sample=False,
	set_alpha_to_one=False,
	steps_offset=1,
	)

	euler_a_scheduler = EulerAncestralDiscreteScheduler(
	num_train_timesteps=1000,
	beta_start=0.00085,
	beta_end=0.012,
	steps_offset=1
	)

	# SDE variant of DPM-Solver++ with Karras sigmas enabled.
	dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
	num_train_timesteps=1000,
	beta_start=0.00085,
	beta_end=0.012,
	algorithm_type="sde-dpmsolver++",
	use_karras_sigmas=True,
	steps_offset=1
	)

	# Pipelines
	# Both pipelines are assembled from the same module-level components
	# (vae, text_encoder, tokenizer, patched unet) and the same scheduler
	# instance, with the safety checker disabled.


	# Text-to-image pipeline; process() uses it when no background is given.
	t2i_pipe = StableDiffusionPipeline(
	vae=vae,
	text_encoder=text_encoder,
	tokenizer=tokenizer,
	unet=unet,
	scheduler=dpmpp_2m_sde_karras_scheduler,
	safety_checker=None,
	requires_safety_checker=False,
	feature_extractor=None,
	image_encoder=None
	)

	# t2i_pipe.enable_model_cpu_offload()
	# t2i_pipe.enable_vae_slicing()
	#t2i_pipe.enable_xformers_memory_efficient_attention()

	# Image-to-image pipeline; process() uses it for the background-initialised
	# pass and for the high-resolution refinement pass.
	i2i_pipe = StableDiffusionImg2ImgPipeline(
	vae=vae,
	text_encoder=text_encoder,
	tokenizer=tokenizer,
	unet=unet,
	scheduler=dpmpp_2m_sde_karras_scheduler,
	safety_checker=None,
	requires_safety_checker=False,
	feature_extractor=None,
	image_encoder=None
	)

	# i2i_pipe.enable_model_cpu_offload()
	# i2i_pipe.enable_vae_slicing()
	#i2i_pipe.enable_xformers_memory_efficient_attention()

	@torch.inference_mode()
	def encode_prompt_inner(txt: str):
	    """Encode ``txt`` with the module-level tokenizer and text encoder.

	    The prompt is tokenized without truncation and split into chunks of
	    ``model_max_length - 2`` tokens. Each chunk is wrapped in the BOS/EOS
	    ids and padded with the EOS id to ``model_max_length``.

	    An empty prompt yields a single chunk holding only BOS/EOS plus padding.
	    Previously it yielded no chunk at all, which produced an empty tensor
	    here and a division by zero in ``encode_prompt_pair``.

	    Returns:
	        ``last_hidden_state`` of the text encoder, one entry per chunk.
	    """
	    max_length = tokenizer.model_max_length
	    chunk_length = max_length - 2  # room for the BOS and EOS ids
	    id_start = tokenizer.bos_token_id
	    id_end = tokenizer.eos_token_id
	    id_pad = id_end

	    def pad(x, p, i):
	        # Truncate to i items, or right-pad with p up to i items.
	        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

	    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
	    # max(..., 1) guarantees at least one chunk for an empty token list.
	    chunk_starts = range(0, max(len(tokens), 1), chunk_length)
	    chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end] for i in chunk_starts]
	    chunks = [pad(ck, id_pad, max_length) for ck in chunks]

	    token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
	    conds = text_encoder(token_ids).last_hidden_state

	    return conds


	@torch.inference_mode()
	def encode_prompt_pair(positive_prompt, negative_prompt):
	    """Encode both prompts and bring them to the same number of chunks.

	    Each prompt is encoded with ``encode_prompt_inner``. The shorter
	    encoding is repeated (and cut) until both have as many chunks as the
	    longer one; the chunks of each are then joined along dimension 1.

	    Returns:
	        ``(c, uc)``: the positive and negative embeddings.
	    """
	    positive = encode_prompt_inner(positive_prompt)
	    negative = encode_prompt_inner(negative_prompt)

	    positive_chunks = len(positive)
	    negative_chunks = len(negative)
	    target_chunks = max(positive_chunks, negative_chunks)

	    # Repeat each encoding often enough to reach target_chunks, then trim.
	    positive_repeats = int(math.ceil(target_chunks / positive_chunks))
	    negative_repeats = int(math.ceil(target_chunks / negative_chunks))
	    positive = torch.cat([positive] * positive_repeats, dim=0)[:target_chunks]
	    negative = torch.cat([negative] * negative_repeats, dim=0)[:target_chunks]

	    # Lay the chunks side by side along dimension 1 of a single batch entry.
	    positive = torch.cat([chunk[None, ...] for chunk in positive], dim=1)
	    negative = torch.cat([chunk[None, ...] for chunk in negative], dim=1)

	    return positive, negative


	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def infer(
	    prompt,
	    image, # This is already RGBA with background removed
	    do_rembg=True,
	    seed=42,
	    randomize_seed=False,
	    guidance_scale=3.0,
	    num_inference_steps=30,
	    reference_conditioning_scale=1.0,
	    negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Generate ``NUM_VIEWS`` views of ``image`` with the multi-view pipeline.

	    A NumPy input is converted to a PIL image first ("RGBA" for four
	    channels, "RGB" otherwise). No background-removal function is passed to
	    ``run_pipeline``; ``do_rembg`` is accepted but not read. When
	    ``randomize_seed`` is set, ``seed`` is replaced by a random value in
	    ``[0, MAX_SEED]``.

	    Returns:
	        The images returned by ``run_pipeline``.
	    """
	    clear_memory()

	    if isinstance(image, np.ndarray):
	        pil_mode = 'RGBA' if image.shape[-1] == 4 else 'RGB'
	        image = Image.fromarray(image, pil_mode)

	    if randomize_seed:
	        seed = random.randint(0, MAX_SEED)

	    # remove_bg_fn stays None: the input is expected to be preprocessed.
	    views, _preprocessed = run_pipeline(
	        pipe,
	        num_views=NUM_VIEWS,
	        text=prompt,
	        image=image,
	        height=HEIGHT,
	        width=WIDTH,
	        num_inference_steps=num_inference_steps,
	        guidance_scale=guidance_scale,
	        seed=seed,
	        remove_bg_fn=None,
	        reference_conditioning_scale=reference_conditioning_scale,
	        negative_prompt=negative_prompt,
	        device=device,
	    )
	    return views


	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def pytorch2numpy(imgs, quant=True):
	    """Convert each tensor in ``imgs`` to a NumPy array with axis 0 moved last.

	    With ``quant`` the values are mapped by ``x * 127.5 + 127.5``, clipped
	    to [0, 255] and returned as uint8; otherwise they are mapped by
	    ``x * 0.5 + 0.5``, clipped to [0, 1] and returned as float32.
	    """
	    def _to_array(tensor):
	        channels_last = tensor.movedim(0, -1)
	        if quant:
	            scaled = channels_last * 127.5 + 127.5
	            return scaled.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
	        scaled = channels_last * 0.5 + 0.5
	        return scaled.detach().float().cpu().numpy().clip(0, 1).astype(np.float32)

	    return [_to_array(tensor) for tensor in imgs]

	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def numpy2pytorch(imgs):
	    """Stack NumPy images into one tensor on the module ``device``/``dtype``.

	    Values are mapped by ``x / 127.0 - 1.0`` and the last axis is moved to
	    position 1.
	    """
	    stacked = np.stack(imgs, axis=0)
	    # Dividing by 127.0 (not 127.5) makes the value 127 map to exactly 0.0.
	    scaled = torch.from_numpy(stacked).float() / 127.0 - 1.0
	    return scaled.movedim(-1, 1).to(device=device, dtype=dtype)


	def resize_and_center_crop(image, target_width, target_height):
	    """Scale a NumPy image to cover the target size, then crop its centre.

	    Returns:
	        The cropped image as a NumPy array.
	    """
	    source = Image.fromarray(image)
	    source_width, source_height = source.size

	    # The larger ratio makes the scaled image cover the target on both axes.
	    scale = max(target_width / source_width, target_height / source_height)
	    scaled_width = int(round(source_width * scale))
	    scaled_height = int(round(source_height * scale))
	    scaled = source.resize((scaled_width, scaled_height), Image.LANCZOS)

	    crop_box = (
	        (scaled_width - target_width) / 2,
	        (scaled_height - target_height) / 2,
	        (scaled_width + target_width) / 2,
	        (scaled_height + target_height) / 2,
	    )
	    return np.array(scaled.crop(crop_box))


	def resize_without_crop(image, target_width, target_height):
	    """Resize a NumPy image to exactly the target size, ignoring aspect ratio."""
	    target_size = (target_width, target_height)
	    return np.array(Image.fromarray(image).resize(target_size, Image.LANCZOS))


	@spaces.GPU
	@torch.inference_mode()
	def run_rmbg(image):
	    """Attach a predicted alpha matte to ``image`` and return it.

	    The image is run through ``transform_image`` and the ``rmbg`` model;
	    the sigmoid of the model's last output is resized to the image size
	    and written as the alpha channel. ``image`` is modified in place.
	    """
	    clear_memory()
	    original_size = image.size
	    batch = transform_image(image).unsqueeze(0).to(device, dtype=torch.float32)

	    with torch.no_grad():
	        predictions = rmbg(batch)[-1].sigmoid().cpu()

	    matte = predictions[0].squeeze()
	    alpha = transforms.ToPILImage()(matte).resize(original_size)
	    image.putalpha(alpha)
	    return image



	def preprocess_image(image: Image.Image, height=768, width=768):
	    """Centre the visible part of an RGBA image on a gray ``height`` x ``width`` canvas.

	    The bounding box of all pixels with non-zero alpha is cropped, resized so
	    its longer side is 90% of the corresponding canvas side, centred on the
	    canvas and composited over mid-gray using its alpha channel.

	    A fully transparent input yields a plain gray canvas, which is what
	    compositing an empty foreground produces. Previously it raised
	    ``ValueError`` when taking the minimum of an empty index array.

	    Args:
	        image: PIL image with an alpha channel (four channels).
	        height, width: size of the returned image.

	    Returns:
	        An RGB PIL image of size ``width`` x ``height``.
	    """
	    image = np.array(image)
	    alpha = image[..., 3] > 0
	    H, W = alpha.shape
	    # get the bounding box of alpha
	    y, x = np.where(alpha)
	    if y.size == 0:
	        # Nothing visible: 0.5 gray scaled to uint8 is 127 (see below).
	        return Image.fromarray(np.full((height, width, 3), 127, dtype=np.uint8))
	    y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
	    x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
	    image_center = image[y0:y1, x0:x1]
	    # resize the longer side to 90% of the canvas side
	    H, W, _ = image_center.shape
	    if H > W:
	        W = int(W * (height * 0.9) / H)
	        H = int(height * 0.9)
	    else:
	        H = int(H * (width * 0.9) / W)
	        W = int(width * 0.9)
	    # A very thin crop could round to zero pixels, which cannot be resized to.
	    H, W = max(H, 1), max(W, 1)
	    image_center = np.array(Image.fromarray(image_center).resize((W, H)))
	    # pad to H, W
	    start_h = (height - H) // 2
	    start_w = (width - W) // 2
	    image = np.zeros((height, width, 4), dtype=np.uint8)
	    image[start_h : start_h + H, start_w : start_w + W] = image_center
	    # Alpha-composite over 0.5 gray in float space, then back to uint8.
	    image = image.astype(np.float32) / 255.0
	    image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
	    image = (image * 255).clip(0, 255).astype(np.uint8)
	    image = Image.fromarray(image)
	    return image


	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source, input_bg=None):
	    """Relight ``input_fg`` in two diffusion passes and return the results.

	    A first pass runs at the foreground's own size: text-to-image when no
	    background is available, otherwise image-to-image starting from the
	    background. Its output is decoded, resized by ``highres_scale`` (rounded
	    to a multiple of 64), re-encoded and refined by a second image-to-image
	    pass. The final images are resized back to the foreground's size.

	    Fixes over the previous version:
	      * ``input_bg`` was read but never defined; it is now an optional
	        trailing parameter, so existing positional callers are unaffected.
	      * The background selection was split into two ``if`` chains, so
	        "Use Background Image" and "Use Flipped Background Image" always
	        reached the final ``else``. It is a single chain now.
	      * ``raise 'Wrong initial latent!'`` raised ``TypeError`` because a
	        string is not an exception; it raises ``ValueError`` now.

	    Args:
	        input_fg: foreground image as a NumPy array (height x width x channels).
	        prompt, a_prompt: joined with ", " to form the positive prompt.
	        n_prompt: negative prompt.
	        image_width, image_height: accepted but not read; the size of
	            ``input_fg`` is used instead.
	        num_samples: images generated per prompt.
	        seed: seed for the torch generator.
	        steps: base number of inference steps (divided by the denoise strength
	            for the image-to-image passes).
	        cfg: guidance scale.
	        highres_scale: scale factor for the refinement pass.
	        highres_denoise, lowres_denoise: image-to-image strengths.
	        bg_source: a ``BGSource`` member or its value.
	        input_bg: optional background as a NumPy array, used by the two
	            upload sources; ignored by the synthetic light sources.

	    Returns:
	        A stacked NumPy array of the generated images.

	    Raises:
	        ValueError: if ``bg_source`` is not a valid ``BGSource``.
	    """
	    clear_memory()

	    # Get input dimensions
	    input_height, input_width = input_fg.shape[:2]

	    bg_source = BGSource(bg_source)

	    if bg_source == BGSource.UPLOAD:
	        pass
	    elif bg_source == BGSource.UPLOAD_FLIP:
	        # Without an uploaded background there is nothing to flip.
	        if input_bg is not None:
	            input_bg = np.fliplr(input_bg)
	    elif bg_source == BGSource.GREY:
	        input_bg = np.zeros(shape=(input_height, input_width, 3), dtype=np.uint8) + 64
	    elif bg_source == BGSource.LEFT:
	        gradient = np.linspace(255, 0, input_width)
	        image = np.tile(gradient, (input_height, 1))
	        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
	    elif bg_source == BGSource.RIGHT:
	        gradient = np.linspace(0, 255, input_width)
	        image = np.tile(gradient, (input_height, 1))
	        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
	    elif bg_source == BGSource.TOP:
	        gradient = np.linspace(255, 0, input_height)[:, None]
	        image = np.tile(gradient, (1, input_width))
	        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
	    elif bg_source == BGSource.BOTTOM:
	        gradient = np.linspace(0, 255, input_height)[:, None]
	        image = np.tile(gradient, (1, input_width))
	        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
	    else:
	        raise ValueError('Wrong initial latent!')

	    rng = torch.Generator(device=device).manual_seed(int(seed))

	    # Use input dimensions directly
	    fg = resize_without_crop(input_fg, input_width, input_height)

	    # The encoded foreground is the extra conditioning that
	    # hooked_unet_forward() concatenates onto the latent sample.
	    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
	    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

	    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

	    if input_bg is None:
	        # No background: generate from noise.
	        latents = t2i_pipe(
	            prompt_embeds=conds,
	            negative_prompt_embeds=unconds,
	            width=input_width,
	            height=input_height,
	            num_inference_steps=steps,
	            num_images_per_prompt=num_samples,
	            generator=rng,
	            output_type='latent',
	            guidance_scale=cfg,
	            cross_attention_kwargs={'concat_conds': concat_conds},
	        ).images.to(vae.dtype) / vae.config.scaling_factor
	    else:
	        # Background available: start the diffusion from its latent.
	        bg = resize_without_crop(input_bg, input_width, input_height)
	        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
	        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
	        latents = i2i_pipe(
	            image=bg_latent,
	            strength=lowres_denoise,
	            prompt_embeds=conds,
	            negative_prompt_embeds=unconds,
	            width=input_width,
	            height=input_height,
	            num_inference_steps=int(round(steps / lowres_denoise)),
	            num_images_per_prompt=num_samples,
	            generator=rng,
	            output_type='latent',
	            guidance_scale=cfg,
	            cross_attention_kwargs={'concat_conds': concat_conds},
	        ).images.to(vae.dtype) / vae.config.scaling_factor

	    # Decode, upscale in pixel space to a multiple of 64, and re-encode.
	    pixels = vae.decode(latents).sample
	    pixels = pytorch2numpy(pixels)
	    pixels = [resize_without_crop(
	        image=p,
	        target_width=int(round(input_width * highres_scale / 64.0) * 64),
	        target_height=int(round(input_height * highres_scale / 64.0) * 64))
	        for p in pixels]

	    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
	    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
	    latents = latents.to(device=unet.device, dtype=unet.dtype)

	    highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8

	    # Re-encode the foreground at the refinement resolution.
	    fg = resize_without_crop(input_fg, highres_width, highres_height)
	    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
	    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

	    latents = i2i_pipe(
	        image=latents,
	        strength=highres_denoise,
	        prompt_embeds=conds,
	        negative_prompt_embeds=unconds,
	        width=highres_width,
	        height=highres_height,
	        num_inference_steps=int(round(steps / highres_denoise)),
	        num_images_per_prompt=num_samples,
	        generator=rng,
	        output_type='latent',
	        guidance_scale=cfg,
	        cross_attention_kwargs={'concat_conds': concat_conds},
	    ).images.to(vae.dtype) / vae.config.scaling_factor

	    pixels = vae.decode(latents).sample
	    pixels = pytorch2numpy(pixels)

	    # Resize back to input dimensions
	    pixels = [resize_without_crop(p, input_width, input_height) for p in pixels]
	    pixels = np.stack(pixels)

	    return pixels

	@spaces.GPU(duration=60)
	@torch.inference_mode()
	def extract_foreground(image):
	    """Remove the background of ``image`` and centre the result on a canvas.

	    Returns:
	        ``(result, update, update)`` where ``result`` is the output of
	        ``preprocess_image(run_rmbg(image))`` (``None`` when no image is
	        given) and both updates make their component visible.
	    """
	    if image is None:
	        return None, gr.update(visible=True), gr.update(visible=True)

	    clear_memory()
	    foreground = preprocess_image(run_rmbg(image))
	    return foreground, gr.update(visible=True), gr.update(visible=True)

	def update_extracted_fg_height(selected_image: gr.SelectData):
	    """Return a Gradio update whose height follows the selected image.

	    The height is read from ``value['image']['shape'][0]`` of the selection;
	    a falsy selection falls back to 480.
	    """
	    if not selected_image:
	        return gr.update(height=480)
	    selected_height = selected_image.value['image']['shape'][0]
	    return gr.update(height=selected_height)



	@torch.inference_mode()
	def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
	# Wrapper around process(): accepts the foreground as a PIL image or a
	# NumPy array, logs the shape and dtype of the input and of the result,
	# and returns what process() returns. All other arguments are forwarded
	# unchanged.
	# Convert input foreground from PIL to NumPy array if it's in PIL format
	if isinstance(input_fg, Image.Image):
	input_fg = np.array(input_fg)
	logging.info(f"Input foreground shape: {input_fg.shape}, dtype: {input_fg.dtype}")
	results = process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source)
	logging.info(f"Results shape: {results.shape}, dtype: {results.dtype}")
	return results

	# Example lighting prompts, each wrapped in its own single-item list.
	quick_prompts = [
	    [lighting]
	    for lighting in (
	        'sunshine from window',
	        'golden time',
	        'natural lighting',
	        'warm atmosphere, at home, bedroom',
	        'shadow from window',
	        'soft studio lighting',
	        'home atmosphere, cozy bedroom illumination',
	    )
	]

	# Example subject prompts, each wrapped in its own single-item list.
	quick_subjects = [
	    [subject]
	    for subject in (
	        'modern sofa, high quality leather',
	        'elegant dining table, polished wood',
	        'luxurious bed, premium mattress',
	        'minimalist office desk, clean design',
	        'vintage wooden cabinet, antique finish',
	    )
	]

	class BGSource(Enum):
	# Background/lighting choices accepted by process(). The values are the
	# strings process() converts back into members via BGSource(value).
	# The two UPLOAD members refer to a background image; the remaining
	# members make process() synthesise a gradient or flat background.
	UPLOAD = "Use Background Image"
	UPLOAD_FLIP = "Use Flipped Background Image"
	# NONE = "None"
	LEFT = "Left Light"
	RIGHT = "Right Light"
	TOP = "Top Light"
	BOTTOM = "Bottom Light"
	GREY = "Ambient"

	# Add save function
	def save_images(images, prefix="relight"):
	    """Save ``images`` as PNG files under ``outputs/`` and return their paths.

	    Files are named ``{prefix}_{timestamp}_{index}.png`` with a 1-based
	    index. NumPy arrays are converted to PIL images first; any other item
	    is expected to provide a ``save(path)`` method.

	    Previously the returned list was never filled, so the function always
	    returned ``[]`` even though the files were written.

	    Args:
	        images: iterable of NumPy arrays or PIL images.
	        prefix: filename prefix.

	    Returns:
	        List of the written file paths as strings, in input order.
	    """
	    # Create output directory if it doesn't exist
	    output_dir = Path("outputs")
	    output_dir.mkdir(exist_ok=True)

	    # One timestamp per call; the index keeps names unique within the call.
	    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

	    saved_paths = []
	    for index, img in enumerate(images, start=1):
	        if isinstance(img, np.ndarray):
	            img = Image.fromarray(img)

	        filepath = output_dir / f"{prefix}_{timestamp}_{index}.png"
	        img.save(filepath)
	        saved_paths.append(str(filepath))

	    return saved_paths

	class MaskMover:
	    """Holds an extracted foreground and pastes it onto backgrounds."""

	    def __init__(self):
	        self.extracted_fg = None
	        self.original_fg = None  # untouched copy used for compositing

	    def set_extracted_fg(self, fg_image):
	        """Store two independent array copies of ``fg_image``; return the first."""
	        if isinstance(fg_image, np.ndarray):
	            make_copy = fg_image.copy
	        else:
	            def make_copy():
	                return np.array(fg_image)
	        self.extracted_fg = make_copy()
	        self.original_fg = make_copy()
	        return self.extracted_fg

	    def create_composite(self, background, x_pos, y_pos, scale=1.0):
	        """Paste the stored foreground, scaled and centred on ``(x_pos, y_pos)``.

	        Returns ``background`` unchanged when it is ``None`` or when no
	        foreground has been stored; otherwise an RGB NumPy array.
	        """
	        if self.original_fg is None or background is None:
	            return background

	        def _as_rgba(picture):
	            # Accept either a NumPy array or a PIL image.
	            if isinstance(picture, np.ndarray):
	                picture = Image.fromarray(picture)
	            return picture.convert('RGBA')

	        canvas = _as_rgba(background)
	        sprite = _as_rgba(self.original_fg)

	        scaled_width = int(sprite.width * scale)
	        scaled_height = int(sprite.height * scale)
	        sprite = sprite.resize((scaled_width, scaled_height), Image.LANCZOS)

	        # Top-left corner that puts the sprite's centre at the given position.
	        left = int(x_pos - scaled_width / 2)
	        top = int(y_pos - scaled_height / 2)

	        composite = canvas.copy()
	        # The sprite doubles as its own paste mask via its alpha channel.
	        composite.paste(sprite, (left, top), sprite)

	        return np.array(composite.convert('RGB'))

	def get_depth(image):
	    """Return a colorized depth map for ``image``.

	    Args:
	        image: PIL/Gradio image in RGB channel order, or ``None``.

	    Returns:
	        PIL image of the depth map rendered with the INFERNO colormap, or
	        ``None`` when no image was supplied.
	    """
	    if image is None:
	        return None
	    # Convert from PIL/gradio format to cv2
	    raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
	    # `model` is a module-level depth estimator defined elsewhere in this file.
	    depth = model.infer_image(raw_img)  # HxW raw depth map

	    # Normalize depth to 0..255 for visualization.
	    depth_min = depth.min()
	    depth_range = depth.max() - depth_min
	    if depth_range > 0:
	        depth = ((depth - depth_min) / depth_range * 255).astype(np.uint8)
	    else:
	        # A constant depth map would divide by zero and yield NaNs.
	        depth = np.zeros(depth.shape, dtype=np.uint8)

	    # Convert to RGB for display
	    depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
	    depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
	    return Image.fromarray(depth_colored)

	from PIL import Image

	def compress_image(image):
	    """Downscale and JPEG-compress an image, aiming for at most ~100 KB.

	    The image is shrunk so neither side exceeds 1024 px, then encoded as
	    JPEG starting at quality 95 and stepping down by 5 until the encoded
	    size fits or the quality drops below 20.

	    Encoding happens in memory. The previous version round-tripped through
	    a fixed ``compressed_image.jpg`` in the working directory, so two
	    concurrent requests could overwrite each other's file.

	    Args:
	        image: Numpy array as delivered by a Gradio image component.

	    Returns:
	        Numpy array decoded from the final JPEG.
	    """
	    # Convert Gradio image (numpy array) to PIL Image
	    img = Image.fromarray(image)
	    # JPEG cannot store alpha or palette data; convert only those modes so
	    # grayscale input keeps its shape.
	    if img.mode not in ("RGB", "L", "CMYK"):
	        img = img.convert("RGB")

	    # Resize image if dimensions are too large
	    max_size = 1024  # Maximum dimension size
	    if img.width > max_size or img.height > max_size:
	        ratio = min(max_size / img.width, max_size / img.height)
	        # max(1, ...) keeps extreme aspect ratios from truncating to 0.
	        new_size = (max(1, int(img.width * ratio)), max(1, int(img.height * ratio)))
	        img = img.resize(new_size, Image.Resampling.LANCZOS)

	    def _encode(quality):
	        # Encode the current image as JPEG into a fresh in-memory buffer.
	        buffer = BytesIO()
	        img.save(buffer, "JPEG", quality=quality)
	        return buffer

	    quality = 95  # Start with high quality
	    buffer = _encode(quality)

	    # Check encoded size and lower quality if necessary
	    while buffer.getbuffer().nbytes > 100 * 1024:  # 100KB limit
	        quality -= 5
	        buffer = _encode(quality)
	        if quality < 20:  # Prevent quality from going too low
	            break

	    # Convert back to numpy array for Gradio
	    buffer.seek(0)
	    return np.array(Image.open(buffer))

	def use_orientation(selected_image: gr.SelectData):
	    """Return the file path of the gallery item carried by the select event."""
	    picked = selected_image.value
	    image_info = picked['image']
	    return image_info['path']



	def generate_description(object_description, image, detail="high", max_tokens=75):
	    """Ask ``gpt-4o-mini`` for an interior-design style description.

	    With an image, the model is asked to caption it and to mention
	    ``object_description``. Without one, it is asked to expand
	    ``object_description`` into a fuller paragraph.

	    Args:
	        object_description: Text describing the object or scene.
	        image: PIL image to caption, or ``None`` for text-only mode.
	        detail: Value sent as the ``detail`` field of the image part.
	        max_tokens: Upper bound on the length of the completion.

	    Returns:
	        The generated text. If anything fails (missing API key, encoding
	        error, API error) the error is printed and ``object_description``
	        is returned unchanged, so a bound textbox keeps the user's input.
	    """
	    # Local imports keep this function independent of how the rest of the
	    # file imports these modules.
	    import base64
	    import io

	    if image is not None:
	        prompt = f"As if you were describing the interior design, make a detailed caption of this image in one paragraph. Highlighting textures, furnitures, locations. This object should be included in the description :{object_description}"
	    else:
	        prompt = f"Description: {object_description}. As if you were designing an interior, improve this sentence in one large paragraph. Highlighting textures, furnitures, locations, such that you create a coherent, visually pleasing setting."

	    try:
	        content = [{"type": "text", "text": prompt}]
	        if image is not None:
	            # Send the image inline as a base64 data URL.
	            # IMAGE_FORMAT is a module-level constant defined elsewhere.
	            buffered = io.BytesIO()
	            image.save(buffered, format=IMAGE_FORMAT)
	            img_base64 = base64.b64encode(buffered.getvalue()).decode()
	            content.append(
	                {
	                    "type": "image_url",
	                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}", "detail": detail},
	                }
	            )

	        # Created inside the try block so a missing key also falls back.
	        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
	        response = client.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=[{"role": "user", "content": content}],
	            max_tokens=max_tokens,
	        )
	        return response.choices[0].message.content
	    except Exception as e:
	        # Best-effort UI helper: report the problem and keep the input text.
	        print(e)
	        return object_description


	def _dinox_detections(predictions, class_name_to_id):
	    """Turn DINO-X predictions into an ``sv.Detections`` plus display labels.

	    Category names are lower-cased and stripped before lookup. A category
	    missing from ``class_name_to_id`` is given the next free id instead of
	    raising ``KeyError``; the mapping is updated in place.
	    """
	    boxes, masks, class_ids, labels = [], [], [], []
	    for obj in predictions:
	        boxes.append(obj.bbox)
	        # convert mask to np.array using DDS API
	        masks.append(
	            DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size)
	        )
	        cls_name = obj.category.lower().strip()
	        class_ids.append(class_name_to_id.setdefault(cls_name, len(class_name_to_id)))
	        labels.append(f"{cls_name} {obj.score:.2f}")

	    detections = sv.Detections(
	        xyxy=np.array(boxes),
	        mask=np.array(masks).astype(bool),
	        class_id=np.array(class_ids),
	    )
	    return detections, labels


	def _annotate_detections(image, detections, labels):
	    """Draw boxes, labels and masks for ``detections`` on a copy of ``image``."""
	    frame = image.copy()
	    frame = sv.BoxAnnotator().annotate(scene=frame, detections=detections)
	    frame = sv.LabelAnnotator().annotate(scene=frame, detections=detections, labels=labels)
	    frame = sv.MaskAnnotator().annotate(scene=frame, detections=detections)
	    return frame


	def _center_on_grey_canvas(image, mask, height, width):
	    """Cut the masked object out of ``image`` and center it on a grey canvas.

	    The object's longer side is scaled to 90% of the canvas and everything
	    outside the mask becomes mid-grey. Returns a PIL image, or ``None``
	    when the mask selects no pixels.
	    """
	    rows, cols = np.nonzero(mask)
	    if rows.size == 0:
	        # Nothing selected: min()/max() below would raise on empty arrays.
	        return None

	    src_h, src_w = image.shape[:2]
	    # Opaque inside the mask, fully transparent outside.
	    alpha = np.where(mask, 255, 0).astype(np.uint8)
	    rgba = np.dstack((image[..., :3], alpha)).astype(np.uint8)

	    # Bounding box of the mask, with one pixel of slack on the top/left.
	    y0, y1 = max(rows.min() - 1, 0), min(rows.max() + 1, src_h)
	    x0, x1 = max(cols.min() - 1, 0), min(cols.max() + 1, src_w)
	    crop = rgba[y0:y1, x0:x1]

	    # Resize so the longer side fills 90% of the canvas; max(1, ...) keeps
	    # very thin objects from collapsing to a zero-sized image.
	    crop_h, crop_w = crop.shape[:2]
	    if crop_h > crop_w:
	        new_w = max(1, int(crop_w * (height * 0.9) / crop_h))
	        new_h = int(height * 0.9)
	    else:
	        new_h = max(1, int(crop_h * (width * 0.9) / crop_w))
	        new_w = int(width * 0.9)
	    crop = np.array(Image.fromarray(crop).resize((new_w, new_h), Image.LANCZOS))

	    # Pad to the canvas size, then flatten alpha onto 50% grey.
	    top = (height - new_h) // 2
	    left = (width - new_w) // 2
	    canvas = np.zeros((height, width, 4), dtype=np.uint8)
	    canvas[top : top + new_h, left : left + new_w] = crop
	    canvas = canvas.astype(np.float32) / 255.0
	    blended = canvas[:, :, :3] * canvas[:, :, 3:4] + (1 - canvas[:, :, 3:4]) * 0.5
	    return Image.fromarray((blended * 255).clip(0, 255).astype(np.uint8))


	@spaces.GPU(duration=60)
	@torch.inference_mode
	def process_image(input_image, input_text):
	    """Main processing function for the Gradio interface.

	    Uploads the image to the DDS cloud API, runs DINO-X detection and
	    isolates the first detected object on a grey 768x768 canvas.

	    Args:
	        input_image: RGB image as a PIL image or numpy array, or ``None``.
	        input_text: Object classes separated by periods. Empty or
	            whitespace-only text selects DINO-X's prompt-free mode.

	    Returns:
	        ``(annotated_frame, image, update, update)``: the input with boxes,
	        labels and masks drawn on it; the centered first object as a PIL
	        image (``None`` when nothing was detected); and two
	        ``gr.update(visible=False)`` values.
	    """
	    if input_image is None:
	        return None, None, gr.update(visible=False), gr.update(visible=False)

	    if isinstance(input_image, Image.Image):
	        input_image = np.array(input_image)
	    input_text = input_text or ""

	    clear_memory()

	    # NOTE(security): this token is committed in source. Prefer setting
	    # DDS_API_TOKEN in the environment, then rotate and delete the fallback.
	    api_token = os.getenv("DDS_API_TOKEN", "9c8c865e10ec1821bea79d9fa9dc8720")
	    # Not used below; kept because earlier versions created it on each call.
	    Path("outputs/grounded_sam2_dinox_demo").mkdir(parents=True, exist_ok=True)

	    height = 768
	    width = 768

	    # Initialize DDS client
	    client = Client(Config(api_token))

	    # Save the image to a temp file and upload it. The handle is closed
	    # before OpenCV writes, and the file is removed even if upload fails.
	    tmpfile = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
	    tmpfile.close()
	    try:
	        to_write = input_image
	        if input_image.ndim == 3:
	            # cv2.imwrite expects BGR, but Gradio/PIL arrays are RGB.
	            to_write = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
	        cv2.imwrite(tmpfile.name, to_write)
	        image_url = client.upload_file(tmpfile.name)
	    finally:
	        os.remove(tmpfile.name)

	    if not input_text.strip():
	        # Prompt-free mode: class ids are assigned as categories appear.
	        task = DinoxTask(
	            image_url=image_url,
	            prompts=[TextPrompt(text="<prompt_free>")],
	        )
	        class_name_to_id = {}
	    else:
	        # Run DINO-X detection for the requested classes
	        task = DinoxTask(
	            image_url=image_url,
	            prompts=[TextPrompt(text=input_text)],
	            targets=[DetectionTarget.BBox, DetectionTarget.Mask],
	        )
	        classes = [name.strip().lower() for name in input_text.split('.') if name.strip()]
	        # dict.fromkeys drops duplicates so ids stay contiguous.
	        class_name_to_id = {name: idx for idx, name in enumerate(dict.fromkeys(classes))}

	    client.run_task(task)
	    predictions = task.result.objects

	    if not predictions:
	        # Nothing detected: return the untouched frame rather than failing
	        # while building empty detection arrays.
	        return input_image.copy(), None, gr.update(visible=False), gr.update(visible=False)

	    detections, labels = _dinox_detections(predictions, class_name_to_id)
	    annotated_frame = _annotate_detections(input_image, detections, labels)

	    # Isolate the first detected object.
	    image = None
	    if len(detections) > 0:
	        image = _center_on_grey_canvas(input_image, detections.mask[0], height, width)

	    return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)



	# Import all the necessary functions from the original script
	def get_value_at_index(obj: Union[Sequence, Mapping], index: int) -> Any:
	    """Return ``obj[index]``, falling back to ``obj["result"][index]`` on ``KeyError``."""
	    try:
	        value = obj[index]
	    except KeyError:
	        # Mapping outputs keep their values under the "result" key.
	        value = obj["result"][index]
	    return value

	# Add all the necessary setup functions from the original script
	def find_path(name: str, path: str = None) -> str:
	    """Look for an entry called ``name`` in ``path`` and each of its ancestors.

	    The search starts at ``path`` (the current working directory when
	    omitted) and climbs one directory at a time. The first match is
	    printed and returned as a joined path; ``None`` is returned once the
	    filesystem root has been checked without a match.
	    """
	    current = os.getcwd() if path is None else path
	    while True:
	        if name in os.listdir(current):
	            found = os.path.join(current, name)
	            print(f"{name} found: {found}")
	            return found
	        parent = os.path.dirname(current)
	        if parent == current:
	            # dirname() no longer changes the path: the root was reached.
	            return None
	        current = parent

	def add_comfyui_directory_to_sys_path() -> None:
	    """Append the nearest ``ComfyUI`` directory, if one exists, to ``sys.path``."""
	    comfyui_path = find_path("ComfyUI")
	    if comfyui_path is None or not os.path.isdir(comfyui_path):
	        # Not found, or the match is not a directory: leave sys.path alone.
	        return
	    sys.path.append(comfyui_path)
	    print(f"'{comfyui_path}' added to sys.path")

	def add_extra_model_paths() -> None:
	    """Load ``extra_model_paths.yaml`` through ``load_extra_path_config`` if it is found."""
	    # The loader is imported from `main` first, then from `utils.extra_config`.
	    try:
	        from main import load_extra_path_config
	    except ImportError:
	        from utils.extra_config import load_extra_path_config

	    config_file = find_path("extra_model_paths.yaml")
	    if config_file is None:
	        print("Could not find the extra_model_paths config file.")
	        return
	    load_extra_path_config(config_file)

	# Initialize paths at import time. These calls come before the
	# `from nodes import ...` statement further down; presumably `nodes` only
	# resolves once the ComfyUI directory is on sys.path (verify).
	add_comfyui_directory_to_sys_path()
	add_extra_model_paths()

	def import_custom_nodes() -> None:
	    """Register the extra nodes by calling ``init_extra_nodes()``.

	    Before that call, a fresh asyncio event loop is installed and a
	    ``PromptServer`` plus ``PromptQueue`` are created on it.
	    """
	    import asyncio
	    import execution
	    from nodes import init_extra_nodes
	    import server

	    # Install a new loop as the current one and build the server on it.
	    event_loop = asyncio.new_event_loop()
	    asyncio.set_event_loop(event_loop)
	    prompt_server = server.PromptServer(event_loop)
	    execution.PromptQueue(prompt_server)

	    init_extra_nodes()

	#Import all necessary nodes
	from nodes import (
	StyleModelLoader,
	VAEEncode,
	NODE_CLASS_MAPPINGS,
	LoadImage,
	CLIPVisionLoader,
	SaveImage,
	VAELoader,
	CLIPVisionEncode,
	DualCLIPLoader,
	EmptyLatentImage,
	VAEDecode,
	UNETLoader,
	CLIPTextEncode,
	)

	# Initialize all constant nodes and models in global context.
	# Everything below runs once at import time; generate_image() reuses the
	# resulting module-level objects instead of reloading them per request.
	import_custom_nodes()

	# Global variables for preloaded models and constants
	with torch.inference_mode():
	# Initialize constants
	intconstant = NODE_CLASS_MAPPINGS["INTConstant"]()
	CONST_1024 = intconstant.get_value(value=1024)

	# Load CLIP (T5-XXL + CLIP-L text encoders, loaded with type "flux")
	dualcliploader = DualCLIPLoader()
	CLIP_MODEL = dualcliploader.load_clip(
	clip_name1="t5/t5xxl_fp16.safetensors",
	clip_name2="clip_l.safetensors",
	type="flux",
	)

	# Load VAE
	vaeloader = VAELoader()
	VAE_MODEL = vaeloader.load_vae(vae_name="FLUX1/ae.safetensors")

	# Load UNET
	unetloader = UNETLoader()
	UNET_MODEL = unetloader.load_unet(
	unet_name="flux1-depth-dev.safetensors", weight_dtype="default"
	)

	# Load CLIP Vision
	clipvisionloader = CLIPVisionLoader()
	CLIP_VISION_MODEL = clipvisionloader.load_clip(
	clip_name="sigclip_vision_patch14_384.safetensors"
	)

	# Load Style Model
	stylemodelloader = StyleModelLoader()
	STYLE_MODEL = stylemodelloader.load_style_model(
	style_model_name="flux1-redux-dev.safetensors"
	)

	# Initialize samplers
	ksamplerselect = NODE_CLASS_MAPPINGS["KSamplerSelect"]()
	SAMPLER = ksamplerselect.get_sampler(sampler_name="euler")

	# CLIP input switch node, then the depth model
	cr_clip_input_switch = NODE_CLASS_MAPPINGS["CR Clip Input Switch"]()
	downloadandloaddepthanythingv2model = NODE_CLASS_MAPPINGS["DownloadAndLoadDepthAnythingV2Model"]()
	DEPTH_MODEL = downloadandloaddepthanythingv2model.loadmodel(
	model="depth_anything_v2_vitl_fp32.safetensors"
	)
	# Node instances that generate_image() calls on every request
	cliptextencode = CLIPTextEncode()
	loadimage = LoadImage()
	vaeencode = VAEEncode()
	fluxguidance = NODE_CLASS_MAPPINGS["FluxGuidance"]()
	instructpixtopixconditioning = NODE_CLASS_MAPPINGS["InstructPixToPixConditioning"]()
	clipvisionencode = CLIPVisionEncode()
	stylemodelapplyadvanced = NODE_CLASS_MAPPINGS["StyleModelApplyAdvanced"]()
	emptylatentimage = EmptyLatentImage()
	basicguider = NODE_CLASS_MAPPINGS["BasicGuider"]()
	basicscheduler = NODE_CLASS_MAPPINGS["BasicScheduler"]()
	randomnoise = NODE_CLASS_MAPPINGS["RandomNoise"]()
	samplercustomadvanced = NODE_CLASS_MAPPINGS["SamplerCustomAdvanced"]()
	vaedecode = VAEDecode()
	cr_text = NODE_CLASS_MAPPINGS["CR Text"]()
	saveimage = SaveImage()
	getimagesizeandcount = NODE_CLASS_MAPPINGS["GetImageSizeAndCount"]()
	depthanything_v2 = NODE_CLASS_MAPPINGS["DepthAnything_V2"]()
	imageresize = NODE_CLASS_MAPPINGS["ImageResize+"]()

	@spaces.GPU
	@torch.inference_mode
	def generate_image(prompt, structure_image, style_image, depth_strength=15, style_strength=0.5, progress=gr.Progress(track_tqdm=True)) -> str:
	    """Main generation function that processes inputs and returns the path to the generated image.

	    Runs the preloaded node graph: the structure image is resized and
	    turned into a depth map, the style image is encoded and applied to the
	    conditioning, and 28 sampling steps are run before the result is
	    decoded and saved.

	    Args:
	        prompt: Text prompt for the positive conditioning.
	        structure_image: File path of the image providing the structure.
	        style_image: File path of the image providing the style.
	        depth_strength: Value passed as ``guidance`` to the FluxGuidance node.
	        style_strength: Strength passed to the style-model node.
	        progress: Gradio progress tracker (tracks tqdm bars).

	    Returns:
	        Path of the saved image, relative to the working directory.

	    Raises:
	        gr.Error: If either input image is missing.
	    """
	    if structure_image is None or style_image is None:
	        # Fail with a readable message instead of an error from inside the graph.
	        raise gr.Error("Please provide both a structure image and a style image.")

	    clear_memory()
	    with torch.inference_mode():
	        # Set up CLIP
	        clip_switch = cr_clip_input_switch.switch(
	            Input=1,
	            clip1=get_value_at_index(CLIP_MODEL, 0),
	            clip2=get_value_at_index(CLIP_MODEL, 0),
	        )

	        # Encode text
	        text_encoded = cliptextencode.encode(
	            text=prompt,
	            clip=get_value_at_index(clip_switch, 0),
	        )
	        empty_text = cliptextencode.encode(
	            text="",
	            clip=get_value_at_index(clip_switch, 0),
	        )

	        # Process structure image
	        structure_img = loadimage.load_image(image=structure_image)

	        # Resize image
	        resized_img = imageresize.execute(
	            width=get_value_at_index(CONST_1024, 0),
	            height=get_value_at_index(CONST_1024, 0),
	            interpolation="bicubic",
	            method="keep proportion",
	            condition="always",
	            multiple_of=16,
	            image=get_value_at_index(structure_img, 0),
	        )

	        # Get image size
	        size_info = getimagesizeandcount.getsize(
	            image=get_value_at_index(resized_img, 0)
	        )

	        # Encode VAE
	        vae_encoded = vaeencode.encode(
	            pixels=get_value_at_index(size_info, 0),
	            vae=get_value_at_index(VAE_MODEL, 0),
	        )

	        # Process depth
	        depth_processed = depthanything_v2.process(
	            da_model=get_value_at_index(DEPTH_MODEL, 0),
	            images=get_value_at_index(size_info, 0),
	        )

	        # Apply Flux guidance
	        flux_guided = fluxguidance.append(
	            guidance=depth_strength,
	            conditioning=get_value_at_index(text_encoded, 0),
	        )

	        # Process style image
	        style_img = loadimage.load_image(image=style_image)

	        # Encode style with CLIP Vision
	        style_encoded = clipvisionencode.encode(
	            crop="center",
	            clip_vision=get_value_at_index(CLIP_VISION_MODEL, 0),
	            image=get_value_at_index(style_img, 0),
	        )

	        # Set up conditioning
	        conditioning = instructpixtopixconditioning.encode(
	            positive=get_value_at_index(flux_guided, 0),
	            negative=get_value_at_index(empty_text, 0),
	            vae=get_value_at_index(VAE_MODEL, 0),
	            pixels=get_value_at_index(depth_processed, 0),
	        )

	        # Apply style
	        style_applied = stylemodelapplyadvanced.apply_stylemodel(
	            strength=style_strength,
	            conditioning=get_value_at_index(conditioning, 0),
	            style_model=get_value_at_index(STYLE_MODEL, 0),
	            clip_vision_output=get_value_at_index(style_encoded, 0),
	        )

	        # Set up empty latent
	        empty_latent = emptylatentimage.generate(
	            width=get_value_at_index(resized_img, 1),
	            height=get_value_at_index(resized_img, 2),
	            batch_size=1,
	        )

	        # Set up guidance
	        guided = basicguider.get_guider(
	            model=get_value_at_index(UNET_MODEL, 0),
	            conditioning=get_value_at_index(style_applied, 0),
	        )

	        # Set up scheduler
	        schedule = basicscheduler.get_sigmas(
	            scheduler="simple",
	            steps=28,
	            denoise=1,
	            model=get_value_at_index(UNET_MODEL, 0),
	        )

	        # Generate random noise. randint is inclusive on both ends, so the
	        # upper bound is 2**64 - 1 to stay within an unsigned 64-bit seed.
	        noise = randomnoise.get_noise(noise_seed=random.randint(1, 2**64 - 1))

	        # Sample
	        sampled = samplercustomadvanced.sample(
	            noise=get_value_at_index(noise, 0),
	            guider=get_value_at_index(guided, 0),
	            sampler=get_value_at_index(SAMPLER, 0),
	            sigmas=get_value_at_index(schedule, 0),
	            latent_image=get_value_at_index(empty_latent, 0),
	        )

	        # Decode VAE
	        decoded = vaedecode.decode(
	            samples=get_value_at_index(sampled, 0),
	            vae=get_value_at_index(VAE_MODEL, 0),
	        )

	        # Save image
	        prefix = cr_text.text_multiline(text="Flux_BFL_Depth_Redux")

	        saved = saveimage.save_images(
	            filename_prefix=get_value_at_index(prefix, 0),
	            images=get_value_at_index(decoded, 0),
	        )
	        # NOTE(review): assumes the save node writes into ./output with no
	        # subfolder; confirm against the SaveImage configuration.
	        saved_path = f"output/{saved['ui']['images'][0]['filename']}"

	        clear_memory()
	        return saved_path

	# Create Gradio interface

	# Example rows for the Style Transfer tab, in the order
	# [prompt, structure image, style image, depth strength, style strength].
	examples = [
	["", "chair_input_1.jpg", "chair_input_2.png", 15, 0.5],
	]

	# Created before the Blocks layout and placed later via output_image.render().
	output_image = gr.Image(label="Generated Image")

	with gr.Blocks() as app:
	with gr.Tab("Relighting"):
	with gr.Row():
	gr.Markdown("## Product Placement from Text")
	with gr.Row():
	with gr.Column():
	# Source image plus the two ways of isolating an object from it.
	with gr.Row():
	input_fg = gr.Image(type="pil", label="Image", height=480)
	with gr.Row():
	with gr.Group():
	find_objects_button = gr.Button(value="Segment Object from text")
	text_prompt = gr.Textbox(
	label="Text Prompt",
	placeholder="Enter object classes separated by periods (e.g. 'car . person .'), leave empty to get all objects",
	value=""
	)
	extract_button = gr.Button(value="Remove Background")
	with gr.Row():
	# Annotated detections, the isolated foreground, and a hidden holder
	# that receives the output of convert_to_pil.
	extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
	extracted_fg = gr.Image(type="pil", label="Extracted Foreground", height=480)
	angles_fg = gr.Image(type="pil", label="Converted Foreground", height=480, visible=False)



	with gr.Accordion(label="Alternative Angles", open=False) as alternative_angles:
	with gr.Group():
	run_button = gr.Button("Generate alternative angles")
	orientation_result = gr.Gallery(
	label="Result",
	show_label=False,
	columns=[3],
	rows=[2],
	object_fit="fill",
	height="auto",
	allow_preview=False,
	)

	# Selecting a generated angle replaces the extracted foreground with it.
	if orientation_result:
	orientation_result.select(use_orientation, inputs=None, outputs=extracted_fg)

	# Hidden components used only to pass images between event handlers.
	dummy_image_for_outputs = gr.Image(visible=False, label='Result', type='pil')
	dummy_image_for_prompt_augmentation = gr.Image(type="pil", label="Dummy image", height=48, visible=False)

	with gr.Column():

	with gr.Row():
	with gr.Column(4):
	result_gallery = gr.Gallery(height=832, label='Outputs', object_fit='contain', selected_index=0)
	if result_gallery:
	result_gallery.select(use_orientation, inputs=None, outputs=dummy_image_for_outputs)
	with gr.Column(1):
	with gr.Group():
	gr.Markdown("Outpaint")
	with gr.Row():
	with gr.Column(scale=2):
	prompt_fill = gr.Textbox(label="Prompt (Optional)")
	with gr.Column(scale=1):
	fill_button = gr.Button("Generate")
	target_ratio = gr.Radio(
	label="Image Ratio",
	choices=["9:16", "16:9", "1:1", "Custom"],
	value="9:16",
	scale=3
	)
	alignment_dropdown = gr.Dropdown(
	choices=["Middle", "Left", "Right", "Top", "Bottom"],
	value="Middle",
	label="Alignment",
	)
	resize_option = gr.Radio(
	label="Resize input image",
	choices=["Full", "75%", "50%", "33%", "25%", "Custom"],
	value="75%"
	)
	custom_resize_percentage = gr.Slider(
	label="Custom resize (%)",
	minimum=1,
	maximum=100,
	step=1,
	value=50,
	visible=False
	)

	fill_result = gr.Image(
	interactive=False,
	label="Generated Image",
	)

	# Outpainting canvas size, step count and mask-overlap options.
	with gr.Accordion(label="Advanced settings", open=False) as settings_panel:
	with gr.Column():
	with gr.Row():
	width_slider = gr.Slider(
	label="Target Width",
	minimum=720,
	maximum=1536,
	step=8,
	value=720,
	)
	height_slider = gr.Slider(
	label="Target Height",
	minimum=720,
	maximum=1536,
	step=8,
	value=1280,
	)

	num_inference_steps = gr.Slider(label="Steps", minimum=2, maximum=50, step=1, value=18)
	with gr.Group():
	overlap_percentage = gr.Slider(
	label="Mask overlap (%)",
	minimum=1,
	maximum=50,
	value=10,
	step=1
	)
	with gr.Row():
	overlap_top = gr.Checkbox(label="Overlap Top", value=True)
	overlap_right = gr.Checkbox(label="Overlap Right", value=True)
	with gr.Row():
	overlap_left = gr.Checkbox(label="Overlap Left", value=True)
	overlap_bottom = gr.Checkbox(label="Overlap Bottom", value=True)


	# Relighting prompt and its controls.
	with gr.Row():
	with gr.Group():
	with gr.Column():
	prompt = gr.Textbox(label="Prompt")
	with gr.Column():
	augment_prompt = gr.Button(value='Improve Prompt (+)')

	relight_button = gr.Button(value="Relight")

	cfg = gr.Slider(label="Fidelity", minimum=0.1, maximum=5.0, value=2, step=0.01, visible=True)

	# The first two BGSource members are left out of the choices.
	bg_source = gr.Radio(choices=[e.value for e in list(BGSource)[2:]],
	value=BGSource.LEFT.value,
	label="Lighting Preference (Initial Latent)", type='value')

	# Clickable prompt presets; quick_subjects and quick_prompts are defined
	# elsewhere in this file.
	example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
	example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])

	# Hidden group: these components still feed the relight inputs (see `ips`)
	# but are not shown to the user.
	with gr.Group(visible=False):
	with gr.Row():
	num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
	seed = gr.Number(label="Seed", value=12345, precision=0)

	with gr.Row():
	image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
	image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

	with gr.Accordion("Advanced options", open=False):
	steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1)
	lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01)
	highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
	highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01)
	a_prompt = gr.Textbox(label="Added Prompt", value='best quality', visible=False)
	n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality', visible=False)
	# Hidden position sliders; they are outputs of the extract_button handler.
	x_slider = gr.Slider(
	minimum=0,
	maximum=1000,
	label="X Position",
	value=500,
	visible=False
	)
	y_slider = gr.Slider(
	minimum=0,
	maximum=1000,
	label="Y Position",
	value=500,
	visible=False
	)

	# with gr.Row():

	# gr.Examples(
	# fn=lambda *args: ([args[-1]], None),
	# examples=db_examples.foreground_conditioned_examples,
	# inputs=[
	# input_fg, prompt, bg_source, image_width, image_height, seed, dummy_image_for_outputs
	# ],
	# outputs=[result_gallery, output_bg],
	# run_on_click=True, examples_per_page=1024
	# )

	def move_prompt(prompt):
	# Identity handler: copies one textbox's value into another component.
	return prompt

	# "Improve Prompt" rewrites the prompt in place, then mirrors it to the outpaint prompt.
	augment_prompt.click(generate_description, inputs=[prompt, extracted_fg], outputs=[prompt]).then(move_prompt, [prompt], [prompt_fill])
	# Inputs handed to process_relight, in the order listed here.
	ips = [extracted_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]

	relight_button.click(fn=process_relight, inputs=ips, outputs=[result_gallery])#.then(clear_memory, inputs=[], outputs=[])
	# Lighting preset: keep the first two comma-separated parts of the prompt and append the preset.
	example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
	# Subject preset: replace the prompt with the selected sample.
	example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)

	# def use_output_as_input(output_image):
	# return output_image

	# use_as_input_button.click(
	# fn=use_output_as_input,
	# inputs=[fill_result],
	# outputs=[input_image]
	# )

	# Choosing a ratio updates both size sliders and the settings panel.
	target_ratio.change(
	fn=preload_presets,
	inputs=[target_ratio, width_slider, height_slider],
	outputs=[width_slider, height_slider, settings_panel],
	queue=False
	)

	# Changing either size slider re-selects the ratio radio value.
	width_slider.change(
	fn=select_the_right_preset,
	inputs=[width_slider, height_slider],
	outputs=[target_ratio],
	queue=False
	)

	height_slider.change(
	fn=select_the_right_preset,
	inputs=[width_slider, height_slider],
	outputs=[target_ratio],
	queue=False
	)

	# The resize choice drives the custom percentage slider.
	resize_option.change(
	fn=toggle_custom_resize_slider,
	inputs=[resize_option],
	outputs=[custom_resize_percentage],
	queue=False
	)

	# fill_button.click(
	# fn=clear_result,
	# inputs=None,
	# outputs=fill_result,
	# ).then(
	# fn=inpaint,
	# inputs=[dummy_image_for_outputs, width_slider, height_slider, overlap_percentage, num_inference_steps,
	# resize_option, custom_resize_percentage, prompt_fill, alignment_dropdown,
	# overlap_left, overlap_right, overlap_top, overlap_bottom],
	# outputs=[fill_result])
	# ).then(
	# fn=lambda: gr.update(visible=True),
	# inputs=None,
	# outputs=use_as_input_button,
	# )

	# Outpaint on button click: clear the previous result, then run inpaint
	# on the image held in dummy_image_for_outputs.
	fill_button.click(
	fn=clear_result,
	inputs=None,
	outputs=fill_result,
	).then(
	fn=inpaint,
	inputs=[dummy_image_for_outputs, width_slider, height_slider, overlap_percentage, num_inference_steps,
	resize_option, custom_resize_percentage, prompt_fill, alignment_dropdown,
	overlap_left, overlap_right, overlap_top, overlap_bottom],
	outputs=[fill_result])
	# ).then(
	# fn=lambda: gr.update(visible=True),
	# inputs=None,
	# outputs=use_as_input_button,
	# )

	# Submitting the outpaint prompt runs the same two-step chain as the button.
	prompt_fill.submit(
	fn=clear_result,
	inputs=None,
	outputs=fill_result,
	).then(
	fn=inpaint,
	inputs=[dummy_image_for_outputs, width_slider, height_slider, overlap_percentage, num_inference_steps,
	resize_option, custom_resize_percentage, prompt_fill, alignment_dropdown,
	overlap_left, overlap_right, overlap_top, overlap_bottom],
	outputs=[fill_result])


	def convert_to_pil(image):
	    """Cast an array-like image to ``uint8``; pass anything else through.

	    Despite its name, this function does not build a PIL image. Objects
	    without an ``astype`` method (PIL images, ``None``) are returned
	    unchanged instead of triggering and logging an ``AttributeError`` on
	    every call.

	    Args:
	        image: Numpy array, PIL image, or ``None``.

	    Returns:
	        The ``uint8`` array for array input, otherwise ``image`` itself.
	    """
	    # Local import so the handler does not depend on a module-level import.
	    import logging

	    if not hasattr(image, "astype"):
	        return image
	    try:
	        converted = image.astype(np.uint8)
	    except Exception as e:
	        # Best effort: report the failure and hand back the original image.
	        logging.error(f"Error converting image: {e}")
	        return image
	    logging.info(f"Converted image shape: {converted.shape}, dtype: {converted.dtype}")
	    return converted

	# Alternative angles: pass the foreground through convert_to_pil into the
	# hidden angles_fg image, then run infer to fill the gallery.
	run_button.click(
	fn=convert_to_pil,
	inputs=extracted_fg, # This is already RGBA with removed background
	outputs=angles_fg
	).then(
	fn=infer,
	inputs=[
	text_prompt,
	extracted_fg, # Already processed RGBA image
	],
	outputs=[orientation_result],
	)#.then(clear_memory, inputs=[], outputs=[])

	# NOTE(review): process_image returns four values but only two outputs
	# are bound here; confirm how the installed Gradio version handles the
	# extra values.
	find_objects_button.click(
	fn=process_image,
	inputs=[input_fg, text_prompt],
	outputs=[extracted_objects, extracted_fg]
	)#.then(clear_memory, inputs=[], outputs=[])

	# Background removal also writes to the hidden position sliders.
	extract_button.click(
	fn=extract_foreground,
	inputs=[input_fg],
	outputs=[extracted_fg, x_slider, y_slider]
	)#.then(clear_memory, inputs=[], outputs=[])

	with gr.Tab("Style Transfer"):
	gr.Markdown("## Apply the style of an image to another one")
	with gr.Row():
	with gr.Column():
	prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
	with gr.Row():
	# Both images are passed to generate_image as file paths.
	with gr.Group():
	structure_image = gr.Image(label="Structure Image", type="filepath")
	depth_strength = gr.Slider(minimum=0, maximum=50, value=15, label="Depth Strength")
	with gr.Group():
	style_image = gr.Image(label="Style Image", type="filepath")
	style_strength = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style Strength")
	generate_btn = gr.Button("Generate")

	# Example results are cached with cache_mode="lazy".
	gr.Examples(
	examples=examples,
	inputs=[prompt_input, structure_image, style_image, depth_strength, style_strength],
	outputs=[output_image],
	fn=generate_image,
	cache_examples=True,
	cache_mode="lazy"
	)

	with gr.Column():
	# Place the image component that was created before the Blocks layout.
	output_image.render()
	transfer_btn = gr.Button("Send to relight")

	with gr.Tab("Caption"):
	with gr.Row():
	gr.Markdown("## Describe Image")
	with gr.Row():
	with gr.Column():
	with gr.Row():
	image_to_describe = gr.Image(type="pil", label="Image", height=480)
	with gr.Row():
	with gr.Group():
	with gr.Column():
	describe_button = gr.Button(value="Describe Image")
	# Hidden textbox; its value is the first input to generate_description.
	text_to_describe = gr.Textbox(label="Describe object or scene", visible=False)
	description_text = gr.Textbox(
	label="Output",
	placeholder="",
	value=""
	)
	send_to_relight = gr.Button(value="Send to Relight")


	def send_img(img_result):
	# Identity handler: forwards an image from one component to another.
	return img_result


	# Copy the generated caption into the Relighting prompt.
	send_to_relight.click(move_prompt, [description_text], [prompt])#.then(move_prompt, [description_text], [prompt_fill])

	# Send the style-transfer result to the Relighting tab's input image.
	transfer_btn.click(send_img, [output_image], [input_fg])#.then(clear_memory, inputs=[], outputs=[])

	# describe_button.click(describe_image, [image_to_describe], [description_text])

	describe_button.click(
	fn=generate_description,
	inputs=[text_to_describe,image_to_describe],
	outputs=description_text
	)#.then(clear_memory, inputs=[], outputs=[])

	generate_btn.click(
	fn=generate_image,
	inputs=[prompt_input, structure_image, style_image, depth_strength, style_strength],
	outputs=[output_image]
	)#.then(clear_memory, inputs=[], outputs=[])

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	# app.queue(default_concurrency_limit=3)
	# share=True asks Gradio to create a public share link.
	app.launch(share=True)