r3gm committed
Commit 406ea4d · verified · 1 parent: 09f795e

Upload 5 files

Files changed (5)
  1. app.py +177 -47
  2. constants.py +73 -0
  3. pipeline_newbie_img2img.py +563 -0
  4. pre-requirements.txt +1 -0
  5. requirements.txt +4 -1
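
The main addition is an image-to-image variant of the NewBie pipeline, plus sampler and token-limit helpers in app.py. A minimal sketch of how the pieces fit together, following the wiring in the app.py changes below (illustrative only, not a verified snippet):

import torch
from diffusers import NewbiePipeline
from transformers import AutoModel
from pipeline_newbie_img2img import NewbieImg2ImgPipeline

model_path = "Disty0/NewBie-image-Exp0.1-Diffusers"
text_encoder_2 = AutoModel.from_pretrained(
    model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16
)
# The text-to-image pipeline loads every component once...
pipe = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16).to("cuda")
# ...and the img2img pipeline added in this commit reuses those components instead of loading a second copy.
pipe_img2img = NewbieImg2ImgPipeline(**pipe.components).to("cuda")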
app.py CHANGED
@@ -1,20 +1,41 @@
1
  import gradio as gr
 
2
  import spaces
3
  import torch
4
  from diffusers import AuraFlowPipeline, Lumina2Pipeline, NewbiePipeline
5
- from transformers import AutoModel
6
  import random
7
  import numpy as np
 
 
8
  import warnings
9
-
 
10
  warnings.filterwarnings("ignore")
 
11
 
12
  model_path = "Disty0/NewBie-image-Exp0.1-Diffusers" # NewBie-AI/NewBie-image-Exp0.1
13
  text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
14
  pipe_newbie = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
15
  pipe_newbie.to("cuda")
16
  del text_encoder_2
17
- pipe_newbie.transformer.set_attention_backend("_flash_3_hub")
 
18
 
19
  pipe_pony = AuraFlowPipeline.from_pretrained("purplesmartai/pony-v7-base", torch_dtype=torch.bfloat16)
20
  pipe_pony.to("cuda")
@@ -25,13 +46,73 @@ pipe_netayume = Lumina2Pipeline.from_pretrained(
25
  )
26
  pipe_netayume.to("cuda")
27
 
28
  @spaces.GPU()
29
- def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width, num_inference_steps, guidance_scale, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
30
  if seed < 0:
31
  seed = random.randint(0, 2**32 - 1)
32
 
33
  generator = torch.Generator("cuda").manual_seed(int(seed))
34
 
35
  pipeline_args = {
36
  "prompt": prompt,
37
  "negative_prompt": negative_prompt,
@@ -43,18 +124,35 @@ def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width,
43
  "cfg_trunc_ratio": cfg_trunc_ratio,
44
  "cfg_normalization": cfg_normalization,
45
  "generator": generator,
 
46
  }
47
 
48
  if sigmas_factor != 1.0:
49
  steps = int(num_inference_steps)
50
  sigmas = np.linspace(1.0, 1 / steps, steps)
51
  sigmas = sigmas * sigmas_factor
52
- pipeline_args["sigmas"] = sigmas.tolist()
53
-
54
- image = pipe_newbie(**pipeline_args).images[0]
 
55
 
56
  return image, seed
57
 
 
58
  @spaces.GPU()
59
  def generate_image_pony(prompt, negative_prompt, height, width, num_inference_steps, guidance_scale, sigmas_factor, seed, progress=gr.Progress(track_tqdm=True)):
60
  if seed < 0:
@@ -81,6 +179,7 @@ def generate_image_pony(prompt, negative_prompt, height, width, num_inference_st
81
  image = pipe_pony(**pipeline_args).images[0]
82
  return image, seed
83
 
 
84
  @spaces.GPU()
85
  def generate_image_netayume(prompt, negative_prompt, system_prompt, height, width, guidance_scale, num_inference_steps, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
86
  if seed < 0:
@@ -111,43 +210,29 @@ def generate_image_netayume(prompt, negative_prompt, system_prompt, height, widt
111
 
112
  return image, seed
113
 
114
- newbie_prompt = """<character_1>
115
- <n>$character_1$</n>
116
- <gender>1girl, solo</gender>
117
- <appearance>blonde_hair, long_hair</appearance>
118
- <clothing>large_hat, white_hat, white_blouse, puffy_sleeves, shoulder_cutout, black_skirt, shirt_tucked_in, socks, shoes</clothing>
119
- <expression>looking_up</expression>
120
- <action>sitting, reclining, arm_support, from_side, cowboy_shot, wide_shot</action>
121
- <position>center</position>
122
- </character_1>
123
-
124
- <general_tags>
125
- <count>1girl</count>
126
- <artists>(kazutake hazano:0.5), (onineko:0.8), (r17329 illu:0.2), (ma1ma1helmes b illu:0.2)</artists>
127
- <style>masterpiece, best_quality, high_resolution, detailed</style>
128
- <background>detailed_background, scenery, detailed_background</background>
129
- <atmosphere>cheerful</atmosphere>
130
- <lighting>dynamic_angle, depth_of_field, high_contrast, colorful, detailed_light, light_leaks, beautiful_detailed_glow, best_shadow, shiny_skin, cinematic_lighting, ray_tracing, from_above, female_focus, close-up, dutch_angle, blue_archive</lighting>
131
- <quality>very_aesthetic, masterpiece, no_text</quality>
132
- <objects>bag</objects>
133
- <other>2024_year</other>
134
- </general_tags>"""
135
 
136
  with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as demo:
137
  gr.Markdown("# Image Generation Playground")
138
  with gr.Tabs():
139
  with gr.Tab(label="NewBie Image"):
140
- gr.Markdown("## 🆕 NewBie Image Generation")
 
141
  with gr.Row(variant="panel"):
142
  with gr.Column(scale=2):
143
  prompt_newbie = gr.Textbox(
144
  label="Prompt",
145
- value=newbie_prompt,
146
  lines=3
147
  )
148
  negative_prompt_newbie = gr.Textbox(
149
  label="Negative Prompt",
150
- value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy",
151
  lines=2
152
  )
153
 
@@ -162,27 +247,39 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
162
  )
163
 
164
  with gr.Row():
165
- height_newbie = gr.Slider(label="Height", minimum=512, maximum=2048, step=64, value=1024)
166
- width_newbie = gr.Slider(label="Width", minimum=512, maximum=2048, step=64, value=1024)
167
-
168
  with gr.Row():
169
- steps_newbie = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
170
- guidance_scale_newbie = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=5.0)
171
-
172
- with gr.Row():
173
- cfg_trunc_newbie = gr.Slider(label="CFG Truncation Ratio", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
174
- sigmas_newbie = gr.Slider(label="Sigmas Factor", minimum=0.9, maximum=1.1, step=0.01, value=0.98)
175
-
176
  with gr.Row():
177
- cfg_norm_newbie = gr.Checkbox(label="CFG Normalization", value=True)
178
  seed_newbie = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
179
-
180
  generate_btn_newbie = gr.Button("Generate", variant="primary")
181
 
182
  with gr.Column(scale=1):
183
  image_output_newbie = gr.Image(label="Generated Image", format="png", interactive=False)
184
  used_seed_newbie = gr.Number(label="Used Seed", interactive=False)
185
 
186
  with gr.Tab(label="Pony v7"):
187
  gr.Markdown("## ✨ Pony v7 AuraFlow")
188
  gr.Markdown("Generate images from text prompts using the AuraFlow model.")
@@ -191,7 +288,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
191
  prompt_pony = gr.Textbox(label="Prompt", value="Score_9, ", lines=3)
192
  neg_prompt_pony = gr.Textbox(
193
  label="Negative Prompt",
194
- value="score_6, score_5, score_4, worst quality, low quality, text, deformed, bad hand, blurry, (watermark), extra hands, long ears, ugly, deformed joints, deformed hands, empty background, big ears, narrow face, glowing eyes,",
195
  lines=3
196
  )
197
  with gr.Row():
@@ -217,7 +314,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
217
  with gr.Column(scale=2):
218
  prompt_neta = gr.Textbox(
219
  label="Prompt",
220
- value="kita ikuyo (Bocchi the Rock!), 1girl, anime style, vibrant colors, red hair, medium hair with one side up, green eyes, bangs, hair between eyes, school uniform (white shirt, grey serafuku sailor collar, red neckerchief, pleated skirt), sitting upper body close-up, holding bouquet with white lily & pink flowers, indoors with depth of field, cherry blossom-like light particles, soft sunlight backlighting, bloom, chromatic aberration & lens flare abuse, light smile, closed mouth, one side hair up, transparent blurry foreground, warm cozy atmosphere, masterpiece, best quality",
221
  lines=5
222
  )
223
  neg_prompt_neta = gr.Textbox(label="Negative Prompt", value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy", placeholder="Enter concepts to avoid...", lines=2)
@@ -250,6 +347,34 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
250
  image_output_neta = gr.Image(label="Generated Image", format="png", interactive=False)
251
  used_seed_neta = gr.Number(label="Used Seed", interactive=False)
252
 
253
  generate_btn_newbie.click(
254
  fn=generate_image_newbie,
255
  inputs=[
@@ -263,7 +388,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
263
  cfg_trunc_newbie,
264
  cfg_norm_newbie,
265
  seed_newbie,
266
- sigmas_newbie
267
  ],
268
  outputs=[image_output_newbie, used_seed_newbie]
269
  )
@@ -280,4 +409,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
280
  outputs=[image_output_neta, used_seed_neta]
281
  )
282
 
283
- demo.launch()
 
 
1
  import gradio as gr
2
+ import os
3
  import spaces
4
  import torch
5
  from diffusers import AuraFlowPipeline, Lumina2Pipeline, NewbiePipeline
6
+ from transformers import AutoModel, AutoTokenizer
7
  import random
8
  import numpy as np
9
+ from PIL import Image
10
+ import copy
11
  import warnings
12
+ import math
13
+ import time
14
+ from stablepy import SCHEDULER_CONFIG_MAP, FLUX_SCHEDULE_TYPES, scheduler_names, SCHEDULE_TYPE_OPTIONS, FLUX_SCHEDULE_TYPE_OPTIONS
15
+
16
+ from constants import BASE_PROMPT_NEWBIE, BASE_NEG_PROMPT_NEWBIE, EXAMPLES_NEWBIE, BASE_NEG_PROMPT_PONY7, BASE_PROMPT_NETA
17
+ from pipeline_newbie_img2img import NewbieImg2ImgPipeline
18
+
19
+ FLOW_MATCH_ONLY_MAP = {
20
+ k: v for k, v in SCHEDULER_CONFIG_MAP.items() if "FlowMatch" in k
21
+ }
22
+ FLOW_MATCH_LIST = list(FLOW_MATCH_ONLY_MAP.keys())
23
+ SAMPLER_NEWBIE = [
24
+ k for k in FLOW_MATCH_ONLY_MAP.keys()
25
+ if k not in ["FlowMatch DPM++ SDE", "FlowMatch DPM++ 3M SDE"]
26
+ ]
27
+
28
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
29
  warnings.filterwarnings("ignore")
30
+ NEWBIE_TOKEN_LIMIT = 1100
31
 
32
  model_path = "Disty0/NewBie-image-Exp0.1-Diffusers" # NewBie-AI/NewBie-image-Exp0.1
33
  text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
34
  pipe_newbie = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
35
  pipe_newbie.to("cuda")
36
  del text_encoder_2
37
+ newbie_default_scheduler = copy.deepcopy(pipe_newbie.scheduler)
38
+ pipe_newbie_img2img = NewbieImg2ImgPipeline(**pipe_newbie.components).to("cuda")
39
 
40
  pipe_pony = AuraFlowPipeline.from_pretrained("purplesmartai/pony-v7-base", torch_dtype=torch.bfloat16)
41
  pipe_pony.to("cuda")
 
46
  )
47
  pipe_netayume.to("cuda")
48
 
49
+
50
+ def set_sampler(pipe, sampler_name, schedule_type, default_config):
51
+ if sampler_name != FLOW_MATCH_LIST[0]:
52
+ scheduler_class, config = FLOW_MATCH_ONLY_MAP[sampler_name]
53
+ pipe.scheduler = scheduler_class.from_config(default_config.config, **config)
54
+
55
+ flux_schedule_config = FLUX_SCHEDULE_TYPES.get(schedule_type)
56
+
57
+ if flux_schedule_config:
58
+ pipe.scheduler.register_to_config(**flux_schedule_config)
59
+
60
+ return pipe
61
+
62
+
63
+ def get_newbie_token_details(prompt, system_prompt, tokenizer):
64
+ if prompt is None: prompt = ""
65
+ if system_prompt is None: system_prompt = ""
66
+
67
+ t_sys = tokenizer(str(system_prompt), add_special_tokens=False)["input_ids"]
68
+ t_sep = tokenizer(" <Prompt Start> ", add_special_tokens=False)["input_ids"]
69
+ t_prm = tokenizer(str(prompt), add_special_tokens=False)["input_ids"]
70
+
71
+ total_tokens = len(t_sys) + len(t_sep) + len(t_prm) + 2
72
+
73
+ if total_tokens <= 512:
74
+ sequence_length = 512
75
+ else:
76
+ sequence_length = math.ceil(total_tokens / 512) * 512
77
+
78
+ return total_tokens, sequence_length
79
+
80
+
81
+ def check_token_count(prompt, system_prompt):
82
+ try:
83
+ time.sleep(2)
84
+
85
+ tokenizer = pipe_newbie.tokenizer_2
86
+ total, seq_len = get_newbie_token_details(prompt, system_prompt, tokenizer)
87
+
88
+ if total > NEWBIE_TOKEN_LIMIT:
89
+ return gr.update(
90
+ value=f"<div style='color: #ef4444; border: 1px solid #ef4444; background-color: #fef2f2; padding: 8px; border-radius: 5px; font-weight: bold; width: 100%; text-align: center;'>"
91
+ f"⚠️ Token limit exceeded! ({total}/{NEWBIE_TOKEN_LIMIT}). <br>"
92
+ f"Text will be truncated.</div>",
93
+ visible=True
94
+ )
95
+ else:
96
+ return gr.update(
97
+ value=f"<div style='color: #6b7280; font-size: 0.9em; text-align: right; width: 100%;'> {total}/{min(seq_len, NEWBIE_TOKEN_LIMIT)}</div>",
98
+ visible=True
99
+ )
100
+ except Exception:
101
+ return gr.update(visible=False)
102
+
103
+
104
  @spaces.GPU()
105
+ def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width, num_inference_steps, guidance_scale, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, sampler, schedule_type, image, strength, progress=gr.Progress(track_tqdm=True)):
106
  if seed < 0:
107
  seed = random.randint(0, 2**32 - 1)
108
 
109
  generator = torch.Generator("cuda").manual_seed(int(seed))
110
 
111
+ total_tokens, seq_len = get_newbie_token_details(prompt, system_prompt, pipe_newbie.tokenizer_2)
112
+ if total_tokens > NEWBIE_TOKEN_LIMIT:
113
+ raise ValueError(f"The prompt is longer than the allowed limit of {NEWBIE_TOKEN_LIMIT} tokens.")
114
+ seq_len = min(seq_len, NEWBIE_TOKEN_LIMIT)
115
+
116
  pipeline_args = {
117
  "prompt": prompt,
118
  "negative_prompt": negative_prompt,
 
124
  "cfg_trunc_ratio": cfg_trunc_ratio,
125
  "cfg_normalization": cfg_normalization,
126
  "generator": generator,
127
+ "max_sequence_length": int(seq_len)
128
  }
129
 
130
  if sigmas_factor != 1.0:
131
  steps = int(num_inference_steps)
132
  sigmas = np.linspace(1.0, 1 / steps, steps)
133
  sigmas = sigmas * sigmas_factor
134
+ pipeline_args["sigmas"] = sigmas # .tolist()
135
+
136
+ if image is not None:
137
+ pipe_task_nb = pipe_newbie_img2img
138
+ if isinstance(image, np.ndarray):
139
+ img_pil = Image.fromarray(image)
140
+ else:
141
+ img_pil = Image.open(image)
142
+ img_pil.thumbnail((width, height), Image.Resampling.LANCZOS)
143
+ pipeline_args["image"] = img_pil
144
+ pipeline_args["strength"] = strength
145
+ else:
146
+ pipe_task_nb = pipe_newbie
147
+
148
+ set_sampler(pipe_task_nb, sampler, schedule_type, newbie_default_scheduler)
149
+
150
+ image = pipe_task_nb(**pipeline_args).images[0]
151
+ pipe_task_nb.scheduler = newbie_default_scheduler
152
 
153
  return image, seed
154
 
155
+
156
  @spaces.GPU()
157
  def generate_image_pony(prompt, negative_prompt, height, width, num_inference_steps, guidance_scale, sigmas_factor, seed, progress=gr.Progress(track_tqdm=True)):
158
  if seed < 0:
 
179
  image = pipe_pony(**pipeline_args).images[0]
180
  return image, seed
181
 
182
+
183
  @spaces.GPU()
184
  def generate_image_netayume(prompt, negative_prompt, system_prompt, height, width, guidance_scale, num_inference_steps, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
185
  if seed < 0:
 
210
 
211
  return image, seed
212
 
213
 
214
  with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as demo:
215
  gr.Markdown("# Image Generation Playground")
216
  with gr.Tabs():
217
  with gr.Tab(label="NewBie Image"):
218
+ gr.Markdown("## 🐣 NewBie Image Exp0.1")
219
+ gr.Markdown("A 3.5B parameter experimental DiT model built on Next-DiT and Lumina insights")
220
  with gr.Row(variant="panel"):
221
  with gr.Column(scale=2):
222
  prompt_newbie = gr.Textbox(
223
  label="Prompt",
224
+ value=BASE_PROMPT_NEWBIE,
225
  lines=3
226
  )
227
+
228
+ token_counter_display = gr.HTML(
229
+ value="<div style='color: #6b7280; font-size: 0.9em; text-align: right;'>Token usage: Calculating...</div>",
230
+ visible=True
231
+ )
232
+
233
  negative_prompt_newbie = gr.Textbox(
234
  label="Negative Prompt",
235
+ value=BASE_NEG_PROMPT_NEWBIE,
236
  lines=2
237
  )
238
 
 
247
  )
248
 
249
  with gr.Row():
250
+ height_newbie = gr.Slider(label="Height", minimum=512, maximum=2048, step=64, value=1264)
251
+ width_newbie = gr.Slider(label="Width", minimum=512, maximum=2048, step=64, value=832)
 
252
  with gr.Row():
253
+ steps_newbie = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=30)
254
+ guidance_scale_newbie = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=6.5)
255
  with gr.Row():
256
+ sigmas_newbie = gr.Slider(label="Sigmas Factor", info="Lower values increase detail and complexity. Higher values simplify and clean the image.", minimum=0.9, maximum=1.1, step=0.001, value=0.99)
257
  seed_newbie = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
258
+
259
+ with gr.Accordion("More settings", open=False):
260
+ with gr.Row():
261
+ sampler_newbie = gr.Dropdown(label="Sampler", choices=SAMPLER_NEWBIE, value="FlowMatch DPM++ 2M SDE")
262
+ schedule_type_newbie = gr.Dropdown(label="Schedule Type", choices=FLUX_SCHEDULE_TYPE_OPTIONS, value=FLUX_SCHEDULE_TYPE_OPTIONS[0])
263
+ with gr.Row():
264
+ cfg_norm_newbie = gr.Checkbox(label="CFG Normalization", value=True)
265
+ cfg_trunc_newbie = gr.Slider(label="CFG Truncation Ratio", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
266
+
267
+ with gr.Row():
268
+ image_newbie = gr.Image(label="Reference image", interactive=True)
269
+ strength_newbie = gr.Slider(label="Reference Image Adherence", info="Lower values = strong adherence; higher values = weak adherence.", minimum=0.1, maximum=1., step=0.01, value=0.65)
270
+
271
  generate_btn_newbie = gr.Button("Generate", variant="primary")
272
 
273
  with gr.Column(scale=1):
274
  image_output_newbie = gr.Image(label="Generated Image", format="png", interactive=False)
275
  used_seed_newbie = gr.Number(label="Used Seed", interactive=False)
276
 
277
+ gr.Examples(
278
+ examples=EXAMPLES_NEWBIE,
279
+ inputs=[prompt_newbie],
280
+ label="Example Prompts"
281
+ )
282
+
283
  with gr.Tab(label="Pony v7"):
284
  gr.Markdown("## ✨ Pony v7 AuraFlow")
285
  gr.Markdown("Generate images from text prompts using the AuraFlow model.")
 
288
  prompt_pony = gr.Textbox(label="Prompt", value="Score_9, ", lines=3)
289
  neg_prompt_pony = gr.Textbox(
290
  label="Negative Prompt",
291
+ value=BASE_NEG_PROMPT_PONY7,
292
  lines=3
293
  )
294
  with gr.Row():
 
314
  with gr.Column(scale=2):
315
  prompt_neta = gr.Textbox(
316
  label="Prompt",
317
+ value=BASE_PROMPT_NETA,
318
  lines=5
319
  )
320
  neg_prompt_neta = gr.Textbox(label="Negative Prompt", value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy", placeholder="Enter concepts to avoid...", lines=2)
 
347
  image_output_neta = gr.Image(label="Generated Image", format="png", interactive=False)
348
  used_seed_neta = gr.Number(label="Used Seed", interactive=False)
349
 
350
+ prompt_newbie.change(
351
+ fn=check_token_count,
352
+ inputs=[prompt_newbie, system_prompt_newbie],
353
+ outputs=token_counter_display,
354
+ show_progress="hidden",
355
+ queue=False,
356
+ trigger_mode="always_last",
357
+ api_name=False
358
+ )
359
+ system_prompt_newbie.change(
360
+ fn=check_token_count,
361
+ inputs=[prompt_newbie, system_prompt_newbie],
362
+ outputs=token_counter_display,
363
+ show_progress="hidden",
364
+ queue=False,
365
+ trigger_mode="always_last",
366
+ api_name=False
367
+ )
368
+ # Initialize the counter on load
369
+ demo.load(
370
+ fn=check_token_count,
371
+ inputs=[prompt_newbie, system_prompt_newbie],
372
+ outputs=token_counter_display,
373
+ queue=False,
374
+ trigger_mode="always_last",
375
+ api_name=False
376
+ )
377
+
378
  generate_btn_newbie.click(
379
  fn=generate_image_newbie,
380
  inputs=[
 
388
  cfg_trunc_newbie,
389
  cfg_norm_newbie,
390
  seed_newbie,
391
+ sigmas_newbie,
392
+ sampler_newbie,
393
+ schedule_type_newbie,
394
+ image_newbie,
395
+ strength_newbie,
396
  ],
397
  outputs=[image_output_newbie, used_seed_newbie]
398
  )
 
409
  outputs=[image_output_neta, used_seed_neta]
410
  )
411
 
412
+ if __name__ == "__main__":
413
+ demo.launch()
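
The token accounting added above (get_newbie_token_details) pads the sequence length up to the next multiple of 512, and the app caps it at NEWBIE_TOKEN_LIMIT (1100). A quick worked example with hypothetical token counts:

import math

NEWBIE_TOKEN_LIMIT = 1100

def rounded_seq_len(total_tokens):
    # Same rounding rule as get_newbie_token_details: at least 512, then multiples of 512.
    return 512 if total_tokens <= 512 else math.ceil(total_tokens / 512) * 512

total = 700  # hypothetical: system prompt + " <Prompt Start> " + prompt + 2 special tokens
seq_len = min(rounded_seq_len(total), NEWBIE_TOKEN_LIMIT)  # 1024, still under the 1100 cap
# A 1200-token prompt would round to 1536 and exceed the cap, so generate_image_newbie raises instead.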
constants.py ADDED
@@ -0,0 +1,73 @@
1
+ BASE_PROMPT_NEWBIE = """<character_1>
2
+ <n>original_character</n>
3
+ <gender>1girl</gender>
4
+ <appearance>blonde_hair, long_hair</appearance>
5
+ <clothing>large_hat, white_hat, white_blouse, puffy_sleeves, shoulder cutout, black_skirt, shirt_tucked_in, socks, shoes</clothing>
6
+ <expression>smile, confident</expression>
7
+ <action>sitting, reclining, arm support, from above, female focus, close-up, dutch angle, solo</action>
8
+ <position>center, looking_up</position>
9
+ </character_1>
10
+
11
+ <general_tags>
12
+ <count>1girl, solo</count>
13
+ <artists>ciloranko, kazutake hazano, onineko, r17329 illu, ma1ma1helmes b illu</artists>
14
+ <style>anime_style, digital_art</style>
15
+ <background>detailed_background, scenery, detailed_background</background>
16
+ <atmosphere>cheerful</atmosphere>
17
+ <lighting>dynamic_angle, depth_of_field, high_contrast, colorful, detailed_light, light_leaks, beautiful_detailed_glow, best_shadow, shiny_skin, cinematic_lighting, ray_tracing</lighting>
18
+ <quality>HDR, 8K, masterpiece, best quality, amazing quality, very aesthetic, extreme aesthetic, detailed eyes, sharp eyes, newest, highres, absurdres, incredibly absurdres, very awa, detailed backgroud, finished, overlapping, appropriate posture, appropriate configuration, cropping, thick dense skin, ultra-precise skin, soft cheeks</quality>
19
+ <objects>bag</objects>
20
+ </general_tags>"""
21
+
22
+ BASE_NEG_PROMPT_NEWBIE = """<danbooru_tags>low_score_rate, worst quality, low quality, bad quality, lowres, low res, pixelated, blurry, blurred, compression artifacts, jpeg artifacts, bad anatomy, worst hands, deformed hands, deformed fingers, deformed feet, deformed toes, extra limbs, extra arms, extra legs, extra fingers, extra digits, extra digit, fused fingers, missing limbs, missing arms, missing fingers, missing toes, wrong hands, ugly hands, ugly fingers, twisted hands, flexible deformity, conjoined, disembodied, text, watermark, signature, logo, ugly, worst, very displeasing, displeasing, error, doesnotexist, unfinished, poorly drawn face, poorly drawn hands, poorly drawn feet, artistic error, bad proportions, bad perspective, out of frame, ai-generated, ai-assisted, stable diffusion, overly saturated, overly vivid, cross-eye, expressionless, scan, sketch, monochrome, simple background, abstract, sequence, lineup, 2koma, 4koma, microsoft paint \(medium\), artifacts, adversarial noise, has bad revision, resized, image sample,low_aesthetic</danbooru_tags>"""
23
+
24
+ EXAMPLES_NEWBIE = [
25
+ ["""<character_1>
26
+ <n>original character</n>
27
+ <gender>1girl, solo</gender>
28
+ <appearance>beautiful female, perfect face, flawless skin, sharp features, white_hair, very_long_hair, straight_hair, double_bun, blunt_bangs, sidelocks, hair_blowing, pink_eyes, detailed eyes, shimmering eyes, glossy eyes, red_eyeshadow, small_breasts, petite, slender neck, elegant shoulders, delicate</appearance>
29
+ <clothing>red_dress, elegant_dress, sleeveless_dress, strapless_top, bare_shoulders, intricate_dress, detailed_clothing, textured_fabric, ornate_trim, fringe_trim, jewelry, earrings, necklace, red_shawl, flowing_shawl, fabric_folds, fabric_drape</clothing>
30
+ <expression>looking_at_viewer, looking_up, intense_gaze, captivating, mysterious, alluring, slight_smile</expression>
31
+ <action>upper_body, close-up, hand_up, palm_up, outstretched_arm, foreshortening, wind_effect, hair_flow, clothing_flow, graceful_pose</action>
32
+ <position>upper_body_portrait, close_up, straight_on, from_above, off-center_right</position>
33
+ </character_1>
34
+ <general_tags>
35
+ <style>pro-p style, anime style, painterly, detailed painting, digital painting, masterpiece illustration, professional, artbook illustration</style>
36
+ <background>red_spider_lilies, flower_field, bokeh, depth_of_field, blurry_background, shallow_depth, dark_background, gradient_background, abstract_background, petals_floating, wind_particles</background>
37
+ <atmosphere>dramatic, elegant, mysterious, captivating, alluring, cinematic, stylish, high_fashion, artistic</atmosphere>
38
+ <lighting>dramatic_lighting, rim_lighting, volumetric_lighting, god_rays, chiaroscuro, spotlight, contrast, soft_shadows, sharp_highlights, glowing_edges, lens_flare, atmospheric_light, red_lighting, warm_vs_cold_light, cinematic_lighting, studio_lighting</lighting>
39
+ <quality>masterpiece, best_quality, extremely_detailed, ultra_detailed_cg, 8k, sharp_focus, highres, absurdres, professional, trending_on_artstation, artstation_hd, detailed_skin, detailed_hair, detailed_fabric, detailed_eyes</quality>
40
+ <objects>spider_lilies, red_flowers, shawl, jewelry, earrings, necklace, floating_petals, wind</objects>
41
+ <other>portrait, close-up, upper_body, detailed_portrait, highly_detailed, beautiful_and_detailed, dynamic_composition, elegant_pose, wind_dynamic, fabric_physics, solo, (intricate_design:1.2)</other>
42
+ </general_tags>
43
+ <caption>A breathtakingly detailed pro-p style upper-body close-up portrait of an elegant and mysterious girl. Her flawless face features sharp, captivating pink eyes with shimmering red eyeshadow and glossy lips. Her pristine white hair is styled in perfect double buns with blunt bangs, with long sidelocks and flowing strands lifted by an unseen wind. She wears an intricate, textured red strapless dress with ornate fringe trim, complemented by delicate jewelry and a gracefully flowing red shawl. Her pose is dynamic, with one arm raised and palm upturned in a foreshortened gesture, as she looks up at the viewer with an intense, alluring gaze. The dramatic, cinematic lighting employs strong rim light, chiaroscuro contrasts, and volumetric god rays, illuminating her from above against a shallow depth-of-field background of blurry red spider lilies and floating petals, creating a high-fashion, artistic masterpiece.</caption>"""],
44
+ ["1girl, solo, long hair, breasts, looking at viewer, blue eyes, black hair, hair ornament, dress, holding, closed mouth, jewelry, bare shoulders, upper body, braid, weapon, earrings, sleeveless, sword, white dress, holding weapon, hair bun, bracelet, grey eyes, sleeveless dress, tattoo, sideboob, holding sword, chinese clothes, tassel, sheath, china dress, red nails, side slit, beads, dragon, arm tattoo, arm strap, shoulder tattoo, bead bracelet, tassel earrings, unsheathing, dragon print, eastern dragon, year of the dragon"],
45
+ ["""<character_1>
46
+ <n></n>
47
+ <gender>1girl</gender>
48
+ <appearance>blonde_hair,golden_hair,long_hair,twin_braids,blue_eyes,detailed_eyes,sparkling_eyes,messy_hair,bangs,long_eyelashes,fair_skin,beautiful_face,makeup,nail_polish,white_nails,ring,jewelry</appearance>
49
+ <clothing>maid_headdress,white_frilled_headband,white_dress,frills,ribbon,heart_ornament,gemstone_hair_ornament</clothing>
50
+ <expression>open_mouth,hand_covering_mouth,fingers_on_face,shy,surprised,blush,looking_at_viewer</expression>
51
+ <action>lying_on_back,hands_near_face,face_framing</action>
52
+ <interaction></interaction>
53
+ <position>center,close-up,upper_body</position>
54
+ </character_1>
55
+ <general_tags>
56
+ <count>1girl</count>
57
+ <artists>artist:mika_pikazo</artists>
58
+ <style>anime,vivid_colors,extremely_detailed,colorful,high_contrast,glossy</style>
59
+ <background>light_blue_background,abstract_background</background>
60
+ <environment>surrounded_by_fruit,food_theme,sweets_theme</environment>
61
+ <perspective>from_above,high_angle,looking_down</perspective>
62
+ <atmosphere>cheerful,vibrant,shiny,energetic</atmosphere>
63
+ <lighting>studio_lighting,bright_light,glossy_bouncing_light</lighting>
64
+ <resolution>max_high_resolution</resolution>
65
+ <quality>masterpiece,best_quality,absurdres</quality>
66
+ <objects>strawberries,fruit,whipped_cream,cake,glass_shards,crystal,splashing,heart_gem,floating_objects</objects>
67
+ <other></other>
68
+ </general_tags>"""],
69
+ ]
70
+
71
+ BASE_NEG_PROMPT_PONY7 = "score_6, score_5, score_4, worst quality, low quality, text, deformed, bad hand, blurry, (watermark), extra hands, long ears, ugly, deformed joints, deformed hands, empty background, big ears, narrow face, glowing eyes,"
72
+
73
+ BASE_PROMPT_NETA = "kita ikuyo (Bocchi the Rock!), 1girl, anime style, vibrant colors, red hair, medium hair with one side up, green eyes, bangs, hair between eyes, school uniform (white shirt, grey serafuku sailor collar, red neckerchief, pleated skirt), sitting upper body close-up, holding bouquet with white lily & pink flowers, indoors with depth of field, cherry blossom-like light particles, soft sunlight backlighting, bloom, chromatic aberration & lens flare abuse, light smile, closed mouth, one side hair up, transparent blurry foreground, warm cozy atmosphere, masterpiece, best quality"
pipeline_newbie_img2img.py ADDED
@@ -0,0 +1,563 @@
1
+ # Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ from transformers import (
22
+ PreTrainedModel,
23
+ Gemma3PreTrainedModel,
24
+ GemmaTokenizer,
25
+ GemmaTokenizerFast,
26
+ XLMRobertaTokenizer,
27
+ XLMRobertaTokenizerFast
28
+ )
29
+
30
+ from diffusers.pipelines.pipeline_utils import ImagePipelineOutput
31
+ from diffusers.image_processor import PipelineImageInput
32
+ from diffusers.pipelines.newbie.pipeline_newbie import NewbiePipeline
33
+
34
+ from diffusers.models import AutoencoderKL
35
+ from diffusers.models.transformers.transformer_lumina2 import Lumina2Transformer2DModel
36
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
37
+
38
+ from diffusers.utils import (
39
+ is_torch_xla_available,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
+ from diffusers.utils.torch_utils import randn_tensor
44
+
45
+
46
+ if is_torch_xla_available():
47
+ import torch_xla.core.xla_model as xm
48
+ XLA_AVAILABLE = True
49
+ else:
50
+ XLA_AVAILABLE = False
51
+
52
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
53
+
54
+
55
+ EXAMPLE_DOC_STRING = """
56
+ Examples:
57
+ ```py
58
+ >>> import torch
59
+ >>> from diffusers import NewbieImg2ImgPipeline
60
+ >>> from diffusers.utils import load_image
61
+ >>> from transformers import AutoModel
62
+
63
+ >>> device = "cuda"
64
+ >>> model_path = "Disty0/NewBie-image-Exp0.1-Diffusers"
65
+ >>> text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
66
+
67
+ >>> pipe = NewbieImg2ImgPipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
68
+ >>> pipe.enable_model_cpu_offload(device=device)
69
+
70
+ >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
71
+ >>> init_image = load_image(url).resize((1024, 1024))
72
+
73
+ >>> prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors, anime style"
74
+ >>> negative_prompt = "low quality, worst quality, blurry"
75
+
76
+ >>> image = pipe(
77
+ >>> prompt,
78
+ >>> image=init_image,
79
+ >>> strength=0.6,
80
+ >>> negative_prompt=negative_prompt,
81
+ >>> guidance_scale=2.5,
82
+ >>> num_inference_steps=30,
83
+ >>> generator=torch.manual_seed(42),
84
+ >>> ).images[0]
85
+ ```
86
+ """
87
+
88
+
89
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
90
+ def calculate_shift(
91
+ image_seq_len,
92
+ base_seq_len: int = 256,
93
+ max_seq_len: int = 4096,
94
+ base_shift: float = 0.5,
95
+ max_shift: float = 1.15,
96
+ ):
97
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
98
+ b = base_shift - m * base_seq_len
99
+ mu = image_seq_len * m + b
100
+ return mu
101
+
102
+
103
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
104
+ def retrieve_timesteps(
105
+ scheduler,
106
+ num_inference_steps: Optional[int] = None,
107
+ device: Optional[Union[str, torch.device]] = None,
108
+ timesteps: Optional[List[int]] = None,
109
+ sigmas: Optional[List[float]] = None,
110
+ **kwargs,
111
+ ):
112
+ r"""
113
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
114
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
115
+
116
+ Args:
117
+ scheduler (`SchedulerMixin`):
118
+ The scheduler to get timesteps from.
119
+ num_inference_steps (`int`):
120
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
121
+ must be `None`.
122
+ device (`str` or `torch.device`, *optional*):
123
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
124
+ timesteps (`List[int]`, *optional*):
125
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
126
+ `num_inference_steps` and `sigmas` must be `None`.
127
+ sigmas (`List[float]`, *optional*):
128
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
129
+ `num_inference_steps` and `timesteps` must be `None`.
130
+
131
+ Returns:
132
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
133
+ second element is the number of inference steps.
134
+ """
135
+ if timesteps is not None and sigmas is not None:
136
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
137
+ if timesteps is not None:
138
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
139
+ if not accepts_timesteps:
140
+ raise ValueError(
141
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
142
+ f" timestep schedules. Please check whether you are using the correct scheduler."
143
+ )
144
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
145
+ timesteps = scheduler.timesteps
146
+ num_inference_steps = len(timesteps)
147
+ elif sigmas is not None:
148
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
149
+ if not accept_sigmas:
150
+ raise ValueError(
151
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
152
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
153
+ )
154
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
155
+ timesteps = scheduler.timesteps
156
+ num_inference_steps = len(timesteps)
157
+ else:
158
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
159
+ timesteps = scheduler.timesteps
160
+ return timesteps, num_inference_steps
161
+
162
+
163
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
164
+ def retrieve_latents(
165
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
166
+ ):
167
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
168
+ return encoder_output.latent_dist.sample(generator)
169
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
170
+ return encoder_output.latent_dist.mode()
171
+ elif hasattr(encoder_output, "latents"):
172
+ return encoder_output.latents
173
+ else:
174
+ raise AttributeError("Could not access latents of provided encoder_output")
175
+
176
+
177
+ class NewbieImg2ImgPipeline(NewbiePipeline):
178
+ r"""
179
+ Pipeline for image-to-image generation using Lumina-T2I / Newbie model.
180
+
181
+ This model inherits from [`NewbiePipeline`]. Check the superclass documentation for the generic methods the
182
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
183
+
184
+ Args:
185
+ vae ([`AutoencoderKL`]):
186
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
187
+ text_encoder ([`Gemma3PreTrainedModel`]):
188
+ Frozen Gemma3 text-encoder.
189
+ text_encoder_2 ([`PreTrainedModel`]):
190
+ Frozen JinaCLIPTextModel text-encoder. Requires `trust_remote_code=True`.
191
+ tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
192
+ Gemma tokenizer.
193
+ tokenizer_2 (`XLMRobertaTokenizer` or `XLMRobertaTokenizerFast`):
194
+ XLMRoberta tokenizer.
195
+ transformer ([`Transformer2DModel`]):
196
+ A text conditioned `Transformer2DModel` to denoise the encoded image latents.
197
+ scheduler ([`SchedulerMixin`]):
198
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
199
+ """
200
+
201
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
202
+ def get_timesteps(self, num_inference_steps, strength, device):
203
+ # get the original timestep using init_timestep
204
+ init_timestep = min(num_inference_steps * strength, num_inference_steps)
205
+
206
+ t_start = int(max(num_inference_steps - init_timestep, 0))
207
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
208
+ if hasattr(self.scheduler, "set_begin_index"):
209
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
210
+
211
+ return timesteps, num_inference_steps - t_start
212
+
213
+ def prepare_latents(
214
+ self,
215
+ image,
216
+ timestep,
217
+ batch_size,
218
+ num_channels_latents,
219
+ height,
220
+ width,
221
+ dtype,
222
+ device,
223
+ generator,
224
+ latents=None,
225
+ ):
226
+ if latents is not None:
227
+ return latents.to(device=device, dtype=dtype)
228
+
229
+ # 1. Encode the input image
230
+ image = image.to(device=device, dtype=dtype)
231
+
232
+ if image.shape[1] == num_channels_latents:
233
+ image_latents = image
234
+ else:
235
+ if isinstance(generator, list):
236
+ image_latents = [
237
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
238
+ for i in range(image.shape[0])
239
+ ]
240
+ image_latents = torch.cat(image_latents, dim=0)
241
+ else:
242
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
243
+
244
+ # Apply scaling
245
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
246
+
247
+ # 2. Handle batch size expansion for num_images_per_prompt
248
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
249
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
250
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
251
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
252
+ raise ValueError(
253
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
254
+ )
255
+
256
+ # 3. Add noise to latents
257
+ shape = image_latents.shape
258
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
259
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
260
+
261
+ return latents
262
+
263
+ @torch.no_grad()
264
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
265
+ def __call__(
266
+ self,
267
+ prompt: Union[str, List[str]] = None,
268
+ image: PipelineImageInput = None,
269
+ strength: float = 0.6,
270
+ width: Optional[int] = None,
271
+ height: Optional[int] = None,
272
+ num_inference_steps: int = 30,
273
+ guidance_scale: float = 4.0,
274
+ negative_prompt: Union[str, List[str]] = None,
275
+ sigmas: List[float] = None,
276
+ num_images_per_prompt: Optional[int] = 1,
277
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
278
+ latents: Optional[torch.Tensor] = None,
279
+ prompt_embeds: Optional[torch.Tensor] = None,
280
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
281
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
282
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
283
+ prompt_attention_mask: Optional[torch.Tensor] = None,
284
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
285
+ output_type: Optional[str] = "pil",
286
+ return_dict: bool = True,
287
+ attention_kwargs: Optional[Dict[str, Any]] = None,
288
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
289
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
290
+ system_prompt: Optional[str] = None,
291
+ cfg_trunc_ratio: float = 1.0,
292
+ cfg_normalization: bool = True,
293
+ max_sequence_length: int = 512,
294
+ ) -> Union[ImagePipelineOutput, Tuple]:
295
+ """
296
+ Function invoked when calling the pipeline for image-to-image generation.
297
+
298
+ Args:
299
+ prompt (`str` or `List[str]`, *optional*):
300
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
301
+ instead.
302
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
303
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point.
304
+ strength (`float`, *optional*, defaults to 0.6):
305
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
306
+ starting point and more noise is added the higher the `strength`.
307
+ negative_prompt (`str` or `List[str]`, *optional*):
308
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
309
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
310
+ less than `1`).
311
+ num_inference_steps (`int`, *optional*, defaults to 30):
312
+ The number of denoising steps.
313
+ guidance_scale (`float`, *optional*, defaults to 4.0):
314
+ Guidance scale as defined in [Classifier-Free Diffusion
315
+ Guidance](https://huggingface.co/papers/2207.12598).
316
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
317
+ The number of images to generate per prompt.
318
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
319
+ The height in pixels of the generated image. If not provided, it is inferred from input image.
320
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
321
+ The width in pixels of the generated image. If not provided, it is inferred from input image.
322
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
323
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
324
+ to make generation deterministic.
325
+ latents (`torch.Tensor`, *optional*):
326
+ Pre-generated noisy latents.
327
+ prompt_embeds (`torch.Tensor`, *optional*):
328
+ Pre-generated text embeddings.
329
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
330
+ Pre-generated pooled text embeddings.
331
+ prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
332
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
333
+ Pre-generated negative text embeddings.
334
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*):
335
+ Pre-generated attention mask for negative text embeddings.
336
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
337
+ Pre-generated negative pooled text embeddings.
338
+ output_type (`str`, *optional*, defaults to `"pil"`):
339
+ The output format of the generate image. Choose between
340
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
341
+ return_dict (`bool`, *optional*, defaults to `True`):
342
+ Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
343
+ attention_kwargs:
344
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor`.
345
+ callback_on_step_end (`Callable`, *optional*):
346
+ A function that calls at the end of each denoising steps during the inference.
347
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
348
+ The list of tensor inputs for the `callback_on_step_end` function.
349
+ system_prompt (`str`, *optional*):
350
+ The system prompt to use for the image generation.
351
+ cfg_trunc_ratio (`float`, *optional*, defaults to `1.0`):
352
+ The ratio of the timestep interval to apply normalization-based guidance scale.
353
+ cfg_normalization (`bool`, *optional*, defaults to `True`):
354
+ Whether to apply normalization-based guidance scale.
355
+ max_sequence_length (`int`, defaults to `512`):
356
+ Maximum sequence length to use with the `prompt`.
357
+
358
+ Examples:
359
+
360
+ Returns:
361
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
362
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
363
+ returned where the first element is a list with the generated images
364
+ """
365
+ # 1. Check strength
366
+ if strength < 0 or strength > 1:
367
+ raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
368
+
369
+ # 2. Preprocess image
370
+ init_image = self.image_processor.preprocess(image)
371
+ init_image = init_image.to(dtype=torch.float32)
372
+
373
+ # Get dimensions from image if not specified
374
+ if height is None:
375
+ height = init_image.shape[-2]
376
+ if width is None:
377
+ width = init_image.shape[-1]
378
+
379
+ self._guidance_scale = guidance_scale
380
+ self._attention_kwargs = attention_kwargs
381
+
382
+ # 3. Check inputs. Raise error if not correct
383
+ self.check_inputs(
384
+ prompt,
385
+ height,
386
+ width,
387
+ negative_prompt,
388
+ prompt_embeds=prompt_embeds,
389
+ pooled_prompt_embeds=pooled_prompt_embeds,
390
+ negative_prompt_embeds=negative_prompt_embeds,
391
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
392
+ prompt_attention_mask=prompt_attention_mask,
393
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
394
+ max_sequence_length=max_sequence_length,
395
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
396
+ )
397
+
398
+ # 4. Define call parameters
399
+ if prompt is not None and isinstance(prompt, str):
400
+ batch_size = 1
401
+ elif prompt is not None and isinstance(prompt, list):
402
+ batch_size = len(prompt)
403
+ else:
404
+ batch_size = prompt_embeds.shape[0]
405
+
406
+ device = self._execution_device
407
+
408
+ # 5. Encode input prompt
409
+ (
410
+ prompt_embeds,
411
+ pooled_prompt_embeds,
412
+ prompt_attention_mask,
413
+ negative_prompt_embeds,
414
+ negative_pooled_prompt_embeds,
415
+ negative_prompt_attention_mask,
416
+ ) = self.encode_prompt(
417
+ prompt,
418
+ self.do_classifier_free_guidance,
419
+ negative_prompt=negative_prompt,
420
+ num_images_per_prompt=num_images_per_prompt,
421
+ device=device,
422
+ prompt_embeds=prompt_embeds,
423
+ pooled_prompt_embeds=pooled_prompt_embeds,
424
+ negative_prompt_embeds=negative_prompt_embeds,
425
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
426
+ prompt_attention_mask=prompt_attention_mask,
427
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
428
+ max_sequence_length=max_sequence_length,
429
+ system_prompt=system_prompt,
430
+ )
431
+
432
+ # 6. Prepare timesteps
433
+ full_sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
434
+
435
+ latent_height = height // (self.vae_scale_factor * 2) * 2
436
+ latent_width = width // (self.vae_scale_factor * 2) * 2
437
+ image_seq_len = (latent_height // 2) * (latent_width // 2)
438
+
439
+ mu = calculate_shift(
440
+ image_seq_len,
441
+ self.scheduler.config.get("base_image_seq_len", 256),
442
+ self.scheduler.config.get("max_image_seq_len", 4096),
443
+ self.scheduler.config.get("base_shift", 0.5),
444
+ self.scheduler.config.get("max_shift", 1.15),
445
+ )
446
+
447
+ timesteps, num_inference_steps = retrieve_timesteps(
448
+ self.scheduler,
449
+ num_inference_steps,
450
+ device,
451
+ sigmas=full_sigmas,
452
+ mu=mu,
453
+ )
454
+
455
+ # 7. Adjust timesteps based on strength
456
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
457
+ if num_inference_steps < 1:
458
+ raise ValueError(
459
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
460
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
461
+ )
462
+
463
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
464
+
465
+ # 8. Prepare latents
466
+ latents = self.prepare_latents(
467
+ init_image,
468
+ latent_timestep,
469
+ batch_size * num_images_per_prompt,
470
+ self.transformer.config.in_channels,
471
+ height,
472
+ width,
473
+ prompt_embeds.dtype,
474
+ device,
475
+ generator,
476
+ latents,
477
+ )
478
+
479
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
480
+ self._num_timesteps = len(timesteps)
481
+
482
+ # 9. Denoising loop
483
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
484
+ for i, t in enumerate(timesteps):
485
+ # compute whether apply classifier-free truncation on this timestep
486
+ do_classifier_free_truncation = (i + 1) / num_inference_steps > cfg_trunc_ratio
487
+
488
+ # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
489
+ current_timestep = 1 - t / self.scheduler.config.num_train_timesteps
490
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
491
+ current_timestep = current_timestep.expand(latents.shape[0])
492
+
493
+ noise_pred_cond = self.transformer(
494
+ hidden_states=latents,
495
+ timestep=current_timestep,
496
+ encoder_hidden_states=prompt_embeds,
497
+ pooled_projections=pooled_prompt_embeds,
498
+ encoder_attention_mask=prompt_attention_mask,
499
+ return_dict=False,
500
+ attention_kwargs=self.attention_kwargs,
501
+ )[0]
502
+
503
+ # perform normalization-based guidance scale on a truncated timestep interval
504
+ if self.do_classifier_free_guidance and not do_classifier_free_truncation:
505
+ noise_pred_uncond = self.transformer(
506
+ hidden_states=latents,
507
+ timestep=current_timestep,
508
+ encoder_hidden_states=negative_prompt_embeds,
509
+ pooled_projections=negative_pooled_prompt_embeds,
510
+ encoder_attention_mask=negative_prompt_attention_mask,
511
+ return_dict=False,
512
+ attention_kwargs=self.attention_kwargs,
513
+ )[0]
514
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
515
+ # apply normalization after classifier-free guidance
516
+ if cfg_normalization:
517
+ cond_norm = torch.norm(noise_pred_cond, dim=-1, keepdim=True)
518
+ noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
519
+ noise_pred = noise_pred * (cond_norm / noise_norm)
520
+ else:
521
+ noise_pred = noise_pred_cond
522
+
523
+ # compute the previous noisy sample x_t -> x_t-1
524
+ latents_dtype = latents.dtype
525
+ noise_pred = -noise_pred
526
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
527
+
528
+ if latents.dtype != latents_dtype:
529
+ if torch.backends.mps.is_available():
530
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
531
+ latents = latents.to(latents_dtype)
532
+
533
+ if callback_on_step_end is not None:
534
+ callback_kwargs = {}
535
+ for k in callback_on_step_end_tensor_inputs:
536
+ callback_kwargs[k] = locals()[k]
537
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
538
+
539
+ latents = callback_outputs.pop("latents", latents)
540
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
541
+ pooled_prompt_embeds = callback_outputs.pop("pooled_prompt_embeds", pooled_prompt_embeds)
542
+
543
+ # call the callback, if provided
544
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
545
+ progress_bar.update()
546
+
547
+ if XLA_AVAILABLE:
548
+ xm.mark_step()
549
+
550
+ if not output_type == "latent":
551
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
552
+ image = self.vae.decode(latents, return_dict=False)[0]
553
+ image = self.image_processor.postprocess(image, output_type=output_type)
554
+ else:
555
+ image = latents
556
+
557
+ # Offload all models
558
+ self.maybe_free_model_hooks()
559
+
560
+ if not return_dict:
561
+ return (image,)
562
+
563
+ return ImagePipelineOutput(images=image)
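
The img2img behaviour hinges on get_timesteps above: strength decides how much of the flow-match schedule is skipped before denoising starts from the noised input latents. A small worked sketch with hypothetical values:

num_inference_steps = 30
strength = 0.6

init_timestep = min(num_inference_steps * strength, num_inference_steps)  # 18.0
t_start = int(max(num_inference_steps - init_timestep, 0))                # 12

# Only timesteps[t_start:] are run, i.e. 18 denoising steps, starting from the reference
# image latents noised to the matching sigma via scheduler.scale_noise in prepare_latents.
remaining_steps = num_inference_steps - t_start  # 18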
pre-requirements.txt ADDED
@@ -0,0 +1 @@
1
+ stablepy==0.6.5
requirements.txt CHANGED
@@ -6,4 +6,7 @@ accelerate
6
  timm
7
  torchvision
8
  einops
9
- kernels
6
  timm
7
  torchvision
8
  einops
9
+ torchao==0.11.0
10
+ kernels
11
+ torchsde>=0.2.6
12
+ accelerate==1.12.0
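
On Hugging Face Spaces, pre-requirements.txt is installed before requirements.txt, so stablepy (which supplies the FlowMatch scheduler map imported in app.py) is in place first; torchsde is presumably needed by the SDE samplers. Reproducing the same order locally would look roughly like:

pip install -r pre-requirements.txt
pip install -r requirements.txt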