r3gm committed
Commit 406ea4d · verified · 1 parent: 09f795e

Upload 5 files

Files changed (5)
  1. app.py +177 -47
  2. constants.py +73 -0
  3. pipeline_newbie_img2img.py +563 -0
  4. pre-requirements.txt +1 -0
  5. requirements.txt +4 -1
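
The main addition is an image-to-image variant of the NewBie pipeline, plus sampler and token-limit helpers in app.py. A minimal sketch of how the pieces fit together, following the wiring in the app.py changes below (illustrative only, not a verified snippet):

import torch
from diffusers import NewbiePipeline
from transformers import AutoModel
from pipeline_newbie_img2img import NewbieImg2ImgPipeline

model_path = "Disty0/NewBie-image-Exp0.1-Diffusers"
text_encoder_2 = AutoModel.from_pretrained(
    model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16
)
# The text-to-image pipeline loads every component once...
pipe = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16).to("cuda")
# ...and the img2img pipeline added in this commit reuses those components instead of loading a second copy.
pipe_img2img = NewbieImg2ImgPipeline(**pipe.components).to("cuda")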
app.py CHANGED
@@ -1,20 +1,41 @@
1
  import gradio as gr
 
2
  import spaces
3
  import torch
4
  from diffusers import AuraFlowPipeline, Lumina2Pipeline, NewbiePipeline
5
- from transformers import AutoModel
6
  import random
7
  import numpy as np
 
 
8
  import warnings
9
-
 
10
  warnings.filterwarnings("ignore")
 
11
 
12
  model_path = "Disty0/NewBie-image-Exp0.1-Diffusers" # NewBie-AI/NewBie-image-Exp0.1
13
  text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
14
  pipe_newbie = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
15
  pipe_newbie.to("cuda")
16
  del text_encoder_2
17
- pipe_newbie.transformer.set_attention_backend("_flash_3_hub")
 
18
 
19
  pipe_pony = AuraFlowPipeline.from_pretrained("purplesmartai/pony-v7-base", torch_dtype=torch.bfloat16)
20
  pipe_pony.to("cuda")
@@ -25,13 +46,73 @@ pipe_netayume = Lumina2Pipeline.from_pretrained(
25
  )
26
  pipe_netayume.to("cuda")
27
 
28
  @spaces.GPU()
29
- def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width, num_inference_steps, guidance_scale, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
30
  if seed < 0:
31
  seed = random.randint(0, 2**32 - 1)
32
 
33
  generator = torch.Generator("cuda").manual_seed(int(seed))
34
 
35
  pipeline_args = {
36
  "prompt": prompt,
37
  "negative_prompt": negative_prompt,
@@ -43,18 +124,35 @@ def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width,
43
  "cfg_trunc_ratio": cfg_trunc_ratio,
44
  "cfg_normalization": cfg_normalization,
45
  "generator": generator,
 
46
  }
47
 
48
  if sigmas_factor != 1.0:
49
  steps = int(num_inference_steps)
50
  sigmas = np.linspace(1.0, 1 / steps, steps)
51
  sigmas = sigmas * sigmas_factor
52
- pipeline_args["sigmas"] = sigmas.tolist()
53
-
54
- image = pipe_newbie(**pipeline_args).images[0]
 
55
 
56
  return image, seed
57
 
 
58
  @spaces.GPU()
59
  def generate_image_pony(prompt, negative_prompt, height, width, num_inference_steps, guidance_scale, sigmas_factor, seed, progress=gr.Progress(track_tqdm=True)):
60
  if seed < 0:
@@ -81,6 +179,7 @@ def generate_image_pony(prompt, negative_prompt, height, width, num_inference_st
81
  image = pipe_pony(**pipeline_args).images[0]
82
  return image, seed
83
 
 
84
  @spaces.GPU()
85
  def generate_image_netayume(prompt, negative_prompt, system_prompt, height, width, guidance_scale, num_inference_steps, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
86
  if seed < 0:
@@ -111,43 +210,29 @@ def generate_image_netayume(prompt, negative_prompt, system_prompt, height, widt
111
 
112
  return image, seed
113
 
114
- newbie_prompt = """<character_1>
115
- <n>$character_1$</n>
116
- <gender>1girl, solo</gender>
117
- <appearance>blonde_hair, long_hair</appearance>
118
- <clothing>large_hat, white_hat, white_blouse, puffy_sleeves, shoulder_cutout, black_skirt, shirt_tucked_in, socks, shoes</clothing>
119
- <expression>looking_up</expression>
120
- <action>sitting, reclining, arm_support, from_side, cowboy_shot, wide_shot</action>
121
- <position>center</position>
122
- </character_1>
123
-
124
- <general_tags>
125
- <count>1girl</count>
126
- <artists>(kazutake hazano:0.5), (onineko:0.8), (r17329 illu:0.2), (ma1ma1helmes b illu:0.2)</artists>
127
- <style>masterpiece, best_quality, high_resolution, detailed</style>
128
- <background>detailed_background, scenery, detailed_background</background>
129
- <atmosphere>cheerful</atmosphere>
130
- <lighting>dynamic_angle, depth_of_field, high_contrast, colorful, detailed_light, light_leaks, beautiful_detailed_glow, best_shadow, shiny_skin, cinematic_lighting, ray_tracing, from_above, female_focus, close-up, dutch_angle, blue_archive</lighting>
131
- <quality>very_aesthetic, masterpiece, no_text</quality>
132
- <objects>bag</objects>
133
- <other>2024_year</other>
134
- </general_tags>"""
135
 
136
  with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as demo:
137
  gr.Markdown("# Image Generation Playground")
138
  with gr.Tabs():
139
  with gr.Tab(label="NewBie Image"):
140
- gr.Markdown("## 🆕 NewBie Image Generation")
 
141
  with gr.Row(variant="panel"):
142
  with gr.Column(scale=2):
143
  prompt_newbie = gr.Textbox(
144
  label="Prompt",
145
- value=newbie_prompt,
146
  lines=3
147
  )
148
  negative_prompt_newbie = gr.Textbox(
149
  label="Negative Prompt",
150
- value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy",
151
  lines=2
152
  )
153
 
@@ -162,27 +247,39 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
162
  )
163
 
164
  with gr.Row():
165
- height_newbie = gr.Slider(label="Height", minimum=512, maximum=2048, step=64, value=1024)
166
- width_newbie = gr.Slider(label="Width", minimum=512, maximum=2048, step=64, value=1024)
167
-
168
  with gr.Row():
169
- steps_newbie = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
170
- guidance_scale_newbie = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=5.0)
171
-
172
- with gr.Row():
173
- cfg_trunc_newbie = gr.Slider(label="CFG Truncation Ratio", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
174
- sigmas_newbie = gr.Slider(label="Sigmas Factor", minimum=0.9, maximum=1.1, step=0.01, value=0.98)
175
-
176
  with gr.Row():
177
- cfg_norm_newbie = gr.Checkbox(label="CFG Normalization", value=True)
178
  seed_newbie = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
179
-
180
  generate_btn_newbie = gr.Button("Generate", variant="primary")
181
 
182
  with gr.Column(scale=1):
183
  image_output_newbie = gr.Image(label="Generated Image", format="png", interactive=False)
184
  used_seed_newbie = gr.Number(label="Used Seed", interactive=False)
185
 
186
  with gr.Tab(label="Pony v7"):
187
  gr.Markdown("## ✨ Pony v7 AuraFlow")
188
  gr.Markdown("Generate images from text prompts using the AuraFlow model.")
@@ -191,7 +288,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
191
  prompt_pony = gr.Textbox(label="Prompt", value="Score_9, ", lines=3)
192
  neg_prompt_pony = gr.Textbox(
193
  label="Negative Prompt",
194
- value="score_6, score_5, score_4, worst quality, low quality, text, deformed, bad hand, blurry, (watermark), extra hands, long ears, ugly, deformed joints, deformed hands, empty background, big ears, narrow face, glowing eyes,",
195
  lines=3
196
  )
197
  with gr.Row():
@@ -217,7 +314,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
217
  with gr.Column(scale=2):
218
  prompt_neta = gr.Textbox(
219
  label="Prompt",
220
- value="kita ikuyo (Bocchi the Rock!), 1girl, anime style, vibrant colors, red hair, medium hair with one side up, green eyes, bangs, hair between eyes, school uniform (white shirt, grey serafuku sailor collar, red neckerchief, pleated skirt), sitting upper body close-up, holding bouquet with white lily & pink flowers, indoors with depth of field, cherry blossom-like light particles, soft sunlight backlighting, bloom, chromatic aberration & lens flare abuse, light smile, closed mouth, one side hair up, transparent blurry foreground, warm cozy atmosphere, masterpiece, best quality",
221
  lines=5
222
  )
223
  neg_prompt_neta = gr.Textbox(label="Negative Prompt", value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy", placeholder="Enter concepts to avoid...", lines=2)
@@ -250,6 +347,34 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
250
  image_output_neta = gr.Image(label="Generated Image", format="png", interactive=False)
251
  used_seed_neta = gr.Number(label="Used Seed", interactive=False)
252
 
253
  generate_btn_newbie.click(
254
  fn=generate_image_newbie,
255
  inputs=[
@@ -263,7 +388,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
263
  cfg_trunc_newbie,
264
  cfg_norm_newbie,
265
  seed_newbie,
266
- sigmas_newbie
267
  ],
268
  outputs=[image_output_newbie, used_seed_newbie]
269
  )
@@ -280,4 +409,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as d
280
  outputs=[image_output_neta, used_seed_neta]
281
  )
282
 
283
- demo.launch()
 
 
1
  import gradio as gr
2
+ import os
3
  import spaces
4
  import torch
5
  from diffusers import AuraFlowPipeline, Lumina2Pipeline, NewbiePipeline
6
+ from transformers import AutoModel, AutoTokenizer
7
  import random
8
  import numpy as np
9
+ from PIL import Image
10
+ import copy
11
  import warnings
12
+ import math
13
+ import time
14
+ from stablepy import SCHEDULER_CONFIG_MAP, FLUX_SCHEDULE_TYPES, scheduler_names, SCHEDULE_TYPE_OPTIONS, FLUX_SCHEDULE_TYPE_OPTIONS
15
+
16
+ from constants import BASE_PROMPT_NEWBIE, BASE_NEG_PROMPT_NEWBIE, EXAMPLES_NEWBIE, BASE_NEG_PROMPT_PONY7, BASE_PROMPT_NETA
17
+ from pipeline_newbie_img2img import NewbieImg2ImgPipeline
18
+
19
+ FLOW_MATCH_ONLY_MAP = {
20
+ k: v for k, v in SCHEDULER_CONFIG_MAP.items() if "FlowMatch" in k
21
+ }
22
+ FLOW_MATCH_LIST = list(FLOW_MATCH_ONLY_MAP.keys())
23
+ SAMPLER_NEWBIE = [
24
+ k for k in FLOW_MATCH_ONLY_MAP.keys()
25
+ if k not in ["FlowMatch DPM++ SDE", "FlowMatch DPM++ 3M SDE"]
26
+ ]
27
+
28
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
29
  warnings.filterwarnings("ignore")
30
+ NEWBIE_TOKEN_LIMIT = 1100
31
 
32
  model_path = "Disty0/NewBie-image-Exp0.1-Diffusers" # NewBie-AI/NewBie-image-Exp0.1
33
  text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
34
  pipe_newbie = NewbiePipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
35
  pipe_newbie.to("cuda")
36
  del text_encoder_2
37
+ newbie_default_scheduler = copy.deepcopy(pipe_newbie.scheduler)
38
+ pipe_newbie_img2img = NewbieImg2ImgPipeline(**pipe_newbie.components).to("cuda")
39
 
40
  pipe_pony = AuraFlowPipeline.from_pretrained("purplesmartai/pony-v7-base", torch_dtype=torch.bfloat16)
41
  pipe_pony.to("cuda")
 
46
  )
47
  pipe_netayume.to("cuda")
48
 
49
+
50
+ def set_sampler(pipe, sampler_name, schedule_type, default_config):
51
+ if sampler_name != FLOW_MATCH_LIST[0]:
52
+ scheduler_class, config = FLOW_MATCH_ONLY_MAP[sampler_name]
53
+ pipe.scheduler = scheduler_class.from_config(default_config.config, **config)
54
+
55
+ flux_schedule_config = FLUX_SCHEDULE_TYPES.get(schedule_type)
56
+
57
+ if flux_schedule_config:
58
+ pipe.scheduler.register_to_config(**flux_schedule_config)
59
+
60
+ return pipe
61
+
62
+
63
+ def get_newbie_token_details(prompt, system_prompt, tokenizer):
64
+ if prompt is None: prompt = ""
65
+ if system_prompt is None: system_prompt = ""
66
+
67
+ t_sys = tokenizer(str(system_prompt), add_special_tokens=False)["input_ids"]
68
+ t_sep = tokenizer(" <Prompt Start> ", add_special_tokens=False)["input_ids"]
69
+ t_prm = tokenizer(str(prompt), add_special_tokens=False)["input_ids"]
70
+
71
+ total_tokens = len(t_sys) + len(t_sep) + len(t_prm) + 2
72
+
73
+ if total_tokens <= 512:
74
+ sequence_length = 512
75
+ else:
76
+ sequence_length = math.ceil(total_tokens / 512) * 512
77
+
78
+ return total_tokens, sequence_length
79
+
80
+
81
+ def check_token_count(prompt, system_prompt):
82
+ try:
83
+ time.sleep(2)
84
+
85
+ tokenizer = pipe_newbie.tokenizer_2
86
+ total, seq_len = get_newbie_token_details(prompt, system_prompt, tokenizer)
87
+
88
+ if total > NEWBIE_TOKEN_LIMIT:
89
+ return gr.update(
90
+ value=f"<div style='color: #ef4444; border: 1px solid #ef4444; background-color: #fef2f2; padding: 8px; border-radius: 5px; font-weight: bold; width: 100%; text-align: center;'>"
91
+ f"⚠️ Token limit exceeded! ({total}/{NEWBIE_TOKEN_LIMIT}). <br>"
92
+ f"Text will be truncated.</div>",
93
+ visible=True
94
+ )
95
+ else:
96
+ return gr.update(
97
+ value=f"<div style='color: #6b7280; font-size: 0.9em; text-align: right; width: 100%;'> {total}/{min(seq_len, NEWBIE_TOKEN_LIMIT)}</div>",
98
+ visible=True
99
+ )
100
+ except Exception:
101
+ return gr.update(visible=False)
102
+
103
+
104
  @spaces.GPU()
105
+ def generate_image_newbie(prompt, negative_prompt, system_prompt, height, width, num_inference_steps, guidance_scale, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, sampler, schedule_type, image, strength, progress=gr.Progress(track_tqdm=True)):
106
  if seed < 0:
107
  seed = random.randint(0, 2**32 - 1)
108
 
109
  generator = torch.Generator("cuda").manual_seed(int(seed))
110
 
111
+ total_tokens, seq_len = get_newbie_token_details(prompt, system_prompt, pipe_newbie.tokenizer_2)
112
+ if total_tokens > NEWBIE_TOKEN_LIMIT:
113
+ raise ValueError(f"The prompt is longer than the allowed limit of {NEWBIE_TOKEN_LIMIT} tokens.")
114
+ seq_len = min(seq_len, NEWBIE_TOKEN_LIMIT)
115
+
116
  pipeline_args = {
117
  "prompt": prompt,
118
  "negative_prompt": negative_prompt,
 
124
  "cfg_trunc_ratio": cfg_trunc_ratio,
125
  "cfg_normalization": cfg_normalization,
126
  "generator": generator,
127
+ "max_sequence_length": int(seq_len)
128
  }
129
 
130
  if sigmas_factor != 1.0:
131
  steps = int(num_inference_steps)
132
  sigmas = np.linspace(1.0, 1 / steps, steps)
133
  sigmas = sigmas * sigmas_factor
134
+ pipeline_args["sigmas"] = sigmas # .tolist()
135
+
136
+ if image is not None:
137
+ pipe_task_nb = pipe_newbie_img2img
138
+ if isinstance(image, np.ndarray):
139
+ img_pil = Image.fromarray(image)
140
+ else:
141
+ img_pil = Image.open(image)
142
+ img_pil.thumbnail((width, height), Image.Resampling.LANCZOS)
143
+ pipeline_args["image"] = img_pil
144
+ pipeline_args["strength"] = strength
145
+ else:
146
+ pipe_task_nb = pipe_newbie
147
+
148
+ set_sampler(pipe_task_nb, sampler, schedule_type, newbie_default_scheduler)
149
+
150
+ image = pipe_task_nb(**pipeline_args).images[0]
151
+ pipe_task_nb.scheduler = newbie_default_scheduler
152
 
153
  return image, seed
154
 
155
+
156
  @spaces.GPU()
157
  def generate_image_pony(prompt, negative_prompt, height, width, num_inference_steps, guidance_scale, sigmas_factor, seed, progress=gr.Progress(track_tqdm=True)):
158
  if seed < 0:
 
179
  image = pipe_pony(**pipeline_args).images[0]
180
  return image, seed
181
 
182
+
183
  @spaces.GPU()
184
  def generate_image_netayume(prompt, negative_prompt, system_prompt, height, width, guidance_scale, num_inference_steps, cfg_trunc_ratio, cfg_normalization, seed, sigmas_factor, progress=gr.Progress(track_tqdm=True)):
185
  if seed < 0:
 
210
 
211
  return image, seed
212
 
213
 
214
  with gr.Blocks(theme=gr.themes.Soft(), title="Image Generation Playground") as demo:
215
  gr.Markdown("# Image Generation Playground")
216
  with gr.Tabs():
217
  with gr.Tab(label="NewBie Image"):
218
+ gr.Markdown("## 🐣 NewBie Image Exp0.1")
219
+ gr.Markdown("A 3.5B parameter experimental DiT model built on Next-DiT and Lumina insights")
220
  with gr.Row(variant="panel"):
221
  with gr.Column(scale=2):
222
  prompt_newbie = gr.Textbox(
223
  label="Prompt",
224
+ value=BASE_PROMPT_NEWBIE,
225
  lines=3
226
  )
227
+
228
+ token_counter_display = gr.HTML(
229
+ value="<div style='color: #6b7280; font-size: 0.9em; text-align: right;'>Token usage: Calculating...</div>",
230
+ visible=True
231
+ )
232
+
233
  negative_prompt_newbie = gr.Textbox(
234
  label="Negative Prompt",
235
+ value=BASE_NEG_PROMPT_NEWBIE,
236
  lines=2
237
  )
238
 
 
247
  )
248
 
249
  with gr.Row():
250
+ height_newbie = gr.Slider(label="Height", minimum=512, maximum=2048, step=64, value=1264)
251
+ width_newbie = gr.Slider(label="Width", minimum=512, maximum=2048, step=64, value=832)
 
252
  with gr.Row():
253
+ steps_newbie = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=30)
254
+ guidance_scale_newbie = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=6.5)
255
  with gr.Row():
256
+ sigmas_newbie = gr.Slider(label="Sigmas Factor", info="Lower values increase detail and complexity. Higher values simplify and clean the image.", minimum=0.9, maximum=1.1, step=0.001, value=0.99)
257
  seed_newbie = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
258
+
259
+ with gr.Accordion("More settings", open=False):
260
+ with gr.Row():
261
+ sampler_newbie = gr.Dropdown(label="Sampler", choices=SAMPLER_NEWBIE, value="FlowMatch DPM++ 2M SDE")
262
+ schedule_type_newbie = gr.Dropdown(label="Schedule Type", choices=FLUX_SCHEDULE_TYPE_OPTIONS, value=FLUX_SCHEDULE_TYPE_OPTIONS[0])
263
+ with gr.Row():
264
+ cfg_norm_newbie = gr.Checkbox(label="CFG Normalization", value=True)
265
+ cfg_trunc_newbie = gr.Slider(label="CFG Truncation Ratio", minimum=0.0, maximum=1.0, step=0.05, value=1.0)
266
+
267
+ with gr.Row():
268
+ image_newbie = gr.Image(label="Reference image", interactive=True)
269
+ strength_newbie = gr.Slider(label="Reference Image Adherence", info="Lower values = strong adherence; higher values = weak adherence.", minimum=0.1, maximum=1., step=0.01, value=0.65)
270
+
271
  generate_btn_newbie = gr.Button("Generate", variant="primary")
272
 
273
  with gr.Column(scale=1):
274
  image_output_newbie = gr.Image(label="Generated Image", format="png", interactive=False)
275
  used_seed_newbie = gr.Number(label="Used Seed", interactive=False)
276
 
277
+ gr.Examples(
278
+ examples=EXAMPLES_NEWBIE,
279
+ inputs=[prompt_newbie],
280
+ label="Example Prompts"
281
+ )
282
+
283
  with gr.Tab(label="Pony v7"):
284
  gr.Markdown("## ✨ Pony v7 AuraFlow")
285
  gr.Markdown("Generate images from text prompts using the AuraFlow model.")
 
288
  prompt_pony = gr.Textbox(label="Prompt", value="Score_9, ", lines=3)
289
  neg_prompt_pony = gr.Textbox(
290
  label="Negative Prompt",
291
+ value=BASE_NEG_PROMPT_PONY7,
292
  lines=3
293
  )
294
  with gr.Row():
 
314
  with gr.Column(scale=2):
315
  prompt_neta = gr.Textbox(
316
  label="Prompt",
317
+ value=BASE_PROMPT_NETA,
318
  lines=5
319
  )
320
  neg_prompt_neta = gr.Textbox(label="Negative Prompt", value="low quality, bad quality, blurry, low resolution, deformed, ugly, bad anatomy", placeholder="Enter concepts to avoid...", lines=2)
 
347
  image_output_neta = gr.Image(label="Generated Image", format="png", interactive=False)
348
  used_seed_neta = gr.Number(label="Used Seed", interactive=False)
349
 
350
+ prompt_newbie.change(
351
+ fn=check_token_count,
352
+ inputs=[prompt_newbie, system_prompt_newbie],
353
+ outputs=token_counter_display,
354
+ show_progress="hidden",
355
+ queue=False,
356
+ trigger_mode="always_last",
357
+ api_name=False
358
+ )
359
+ system_prompt_newbie.change(
360
+ fn=check_token_count,
361
+ inputs=[prompt_newbie, system_prompt_newbie],
362
+ outputs=token_counter_display,
363
+ show_progress="hidden",
364
+ queue=False,
365
+ trigger_mode="always_last",
366
+ api_name=False
367
+ )
368
+ # Initialize the counter on load
369
+ demo.load(
370
+ fn=check_token_count,
371
+ inputs=[prompt_newbie, system_prompt_newbie],
372
+ outputs=token_counter_display,
373
+ queue=False,
374
+ trigger_mode="always_last",
375
+ api_name=False
376
+ )
377
+
378
  generate_btn_newbie.click(
379
  fn=generate_image_newbie,
380
  inputs=[
 
388
  cfg_trunc_newbie,
389
  cfg_norm_newbie,
390
  seed_newbie,
391
+ sigmas_newbie,
392
+ sampler_newbie,
393
+ schedule_type_newbie,
394
+ image_newbie,
395
+ strength_newbie,
396
  ],
397
  outputs=[image_output_newbie, used_seed_newbie]
398
  )
 
409
  outputs=[image_output_neta, used_seed_neta]
410
  )
411
 
412
+ if __name__ == "__main__":
413
+ demo.launch()
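
The token accounting added above (get_newbie_token_details) pads the sequence length up to the next multiple of 512, and the app caps it at NEWBIE_TOKEN_LIMIT (1100). A quick worked example with hypothetical token counts:

import math

NEWBIE_TOKEN_LIMIT = 1100

def rounded_seq_len(total_tokens):
    # Same rounding rule as get_newbie_token_details: at least 512, then multiples of 512.
    return 512 if total_tokens <= 512 else math.ceil(total_tokens / 512) * 512

total = 700  # hypothetical: system prompt + " <Prompt Start> " + prompt + 2 special tokens
seq_len = min(rounded_seq_len(total), NEWBIE_TOKEN_LIMIT)  # 1024, still under the 1100 cap
# A 1200-token prompt would round to 1536 and exceed the cap, so generate_image_newbie raises instead.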
constants.py ADDED
@@ -0,0 +1,73 @@
1
+ BASE_PROMPT_NEWBIE = """<character_1>
2
+ <n>original_character</n>
3
+ <gender>1girl</gender>
4
+ <appearance>blonde_hair, long_hair</appearance>
5
+ <clothing>large_hat, white_hat, white_blouse, puffy_sleeves, shoulder cutout, black_skirt, shirt_tucked_in, socks, shoes</clothing>
6
+ <expression>smile, confident</expression>
7
+ <action>sitting, reclining, arm support, from above, female focus, close-up, dutch angle, solo</action>
8
+ <position>center, looking_up</position>
9
+ </character_1>
10
+
11
+ <general_tags>
12
+ <count>1girl, solo</count>
13
+ <artists>ciloranko, kazutake hazano, onineko, r17329 illu, ma1ma1helmes b illu</artists>
14
+ <style>anime_style, digital_art</style>
15
+ <background>detailed_background, scenery, detailed_background</background>
16
+ <atmosphere>cheerful</atmosphere>
17
+ <lighting>dynamic_angle, depth_of_field, high_contrast, colorful, detailed_light, light_leaks, beautiful_detailed_glow, best_shadow, shiny_skin, cinematic_lighting, ray_tracing</lighting>
18
+ <quality>HDR, 8K, masterpiece, best quality, amazing quality, very aesthetic, extreme aesthetic, detailed eyes, sharp eyes, newest, highres, absurdres, incredibly absurdres, very awa, detailed backgroud, finished, overlapping, appropriate posture, appropriate configuration, cropping, thick dense skin, ultra-precise skin, soft cheeks</quality>
19
+ <objects>bag</objects>
20
+ </general_tags>"""
21
+
22
+ BASE_NEG_PROMPT_NEWBIE = """<danbooru_tags>low_score_rate, worst quality, low quality, bad quality, lowres, low res, pixelated, blurry, blurred, compression artifacts, jpeg artifacts, bad anatomy, worst hands, deformed hands, deformed fingers, deformed feet, deformed toes, extra limbs, extra arms, extra legs, extra fingers, extra digits, extra digit, fused fingers, missing limbs, missing arms, missing fingers, missing toes, wrong hands, ugly hands, ugly fingers, twisted hands, flexible deformity, conjoined, disembodied, text, watermark, signature, logo, ugly, worst, very displeasing, displeasing, error, doesnotexist, unfinished, poorly drawn face, poorly drawn hands, poorly drawn feet, artistic error, bad proportions, bad perspective, out of frame, ai-generated, ai-assisted, stable diffusion, overly saturated, overly vivid, cross-eye, expressionless, scan, sketch, monochrome, simple background, abstract, sequence, lineup, 2koma, 4koma, microsoft paint \(medium\), artifacts, adversarial noise, has bad revision, resized, image sample,low_aesthetic</danbooru_tags>"""
23
+
24
+ EXAMPLES_NEWBIE = [
25
+ ["""<character_1>
26
+ <n>original character</n>
27
+ <gender>1girl, solo</gender>
28
+ <appearance>beautiful female, perfect face, flawless skin, sharp features, white_hair, very_long_hair, straight_hair, double_bun, blunt_bangs, sidelocks, hair_blowing, pink_eyes, detailed eyes, shimmering eyes, glossy eyes, red_eyeshadow, small_breasts, petite, slender neck, elegant shoulders, delicate</appearance>
29
+ <clothing>red_dress, elegant_dress, sleeveless_dress, strapless_top, bare_shoulders, intricate_dress, detailed_clothing, textured_fabric, ornate_trim, fringe_trim, jewelry, earrings, necklace, red_shawl, flowing_shawl, fabric_folds, fabric_drape</clothing>
30
+ <expression>looking_at_viewer, looking_up, intense_gaze, captivating, mysterious, alluring, slight_smile</expression>
31
+ <action>upper_body, close-up, hand_up, palm_up, outstretched_arm, foreshortening, wind_effect, hair_flow, clothing_flow, graceful_pose</action>
32
+ <position>upper_body_portrait, close_up, straight_on, from_above, off-center_right</position>
33
+ </character_1>
34
+ <general_tags>
35
+ <style>pro-p style, anime style, painterly, detailed painting, digital painting, masterpiece illustration, professional, artbook illustration</style>
36
+ <background>red_spider_lilies, flower_field, bokeh, depth_of_field, blurry_background, shallow_depth, dark_background, gradient_background, abstract_background, petals_floating, wind_particles</background>
37
+ <atmosphere>dramatic, elegant, mysterious, captivating, alluring, cinematic, stylish, high_fashion, artistic</atmosphere>
38
+ <lighting>dramatic_lighting, rim_lighting, volumetric_lighting, god_rays, chiaroscuro, spotlight, contrast, soft_shadows, sharp_highlights, glowing_edges, lens_flare, atmospheric_light, red_lighting, warm_vs_cold_light, cinematic_lighting, studio_lighting</lighting>
39
+ <quality>masterpiece, best_quality, extremely_detailed, ultra_detailed_cg, 8k, sharp_focus, highres, absurdres, professional, trending_on_artstation, artstation_hd, detailed_skin, detailed_hair, detailed_fabric, detailed_eyes</quality>
40
+ <objects>spider_lilies, red_flowers, shawl, jewelry, earrings, necklace, floating_petals, wind</objects>
41
+ <other>portrait, close-up, upper_body, detailed_portrait, highly_detailed, beautiful_and_detailed, dynamic_composition, elegant_pose, wind_dynamic, fabric_physics, solo, (intricate_design:1.2)</other>
42
+ </general_tags>
43
+ <caption>A breathtakingly detailed pro-p style upper-body close-up portrait of an elegant and mysterious girl. Her flawless face features sharp, captivating pink eyes with shimmering red eyeshadow and glossy lips. Her pristine white hair is styled in perfect double buns with blunt bangs, with long sidelocks and flowing strands lifted by an unseen wind. She wears an intricate, textured red strapless dress with ornate fringe trim, complemented by delicate jewelry and a gracefully flowing red shawl. Her pose is dynamic, with one arm raised and palm upturned in a foreshortened gesture, as she looks up at the viewer with an intense, alluring gaze. The dramatic, cinematic lighting employs strong rim light, chiaroscuro contrasts, and volumetric god rays, illuminating her from above against a shallow depth-of-field background of blurry red spider lilies and floating petals, creating a high-fashion, artistic masterpiece.</caption>"""],
44
+ ["1girl, solo, long hair, breasts, looking at viewer, blue eyes, black hair, hair ornament, dress, holding, closed mouth, jewelry, bare shoulders, upper body, braid, weapon, earrings, sleeveless, sword, white dress, holding weapon, hair bun, bracelet, grey eyes, sleeveless dress, tattoo, sideboob, holding sword, chinese clothes, tassel, sheath, china dress, red nails, side slit, beads, dragon, arm tattoo, arm strap, shoulder tattoo, bead bracelet, tassel earrings, unsheathing, dragon print, eastern dragon, year of the dragon"],
45
+ ["""<character_1>
46
+ <n></n>
47
+ <gender>1girl</gender>
48
+ <appearance>blonde_hair,golden_hair,long_hair,twin_braids,blue_eyes,detailed_eyes,sparkling_eyes,messy_hair,bangs,long_eyelashes,fair_skin,beautiful_face,makeup,nail_polish,white_nails,ring,jewelry</appearance>
49
+ <clothing>maid_headdress,white_frilled_headband,white_dress,frills,ribbon,heart_ornament,gemstone_hair_ornament</clothing>
50
+ <expression>open_mouth,hand_covering_mouth,fingers_on_face,shy,surprised,blush,looking_at_viewer</expression>
51
+ <action>lying_on_back,hands_near_face,face_framing</action>
52
+ <interaction></interaction>
53
+ <position>center,close-up,upper_body</position>
54
+ </character_1>
55
+ <general_tags>
56
+ <count>1girl</count>
57
+ <artists>artist:mika_pikazo</artists>
58
+ <style>anime,vivid_colors,extremely_detailed,colorful,high_contrast,glossy</style>
59
+ <background>light_blue_background,abstract_background</background>
60
+ <environment>surrounded_by_fruit,food_theme,sweets_theme</environment>
61
+ <perspective>from_above,high_angle,looking_down</perspective>
62
+ <atmosphere>cheerful,vibrant,shiny,energetic</atmosphere>
63
+ <lighting>studio_lighting,bright_light,glossy_bouncing_light</lighting>
64
+ <resolution>max_high_resolution</resolution>
65
+ <quality>masterpiece,best_quality,absurdres</quality>
66
+ <objects>strawberries,fruit,whipped_cream,cake,glass_shards,crystal,splashing,heart_gem,floating_objects</objects>
67
+ <other></other>
68
+ </general_tags>"""],
69
+ ]
70
+
71
+ BASE_NEG_PROMPT_PONY7 = "score_6, score_5, score_4, worst quality, low quality, text, deformed, bad hand, blurry, (watermark), extra hands, long ears, ugly, deformed joints, deformed hands, empty background, big ears, narrow face, glowing eyes,"
72
+
73
+ BASE_PROMPT_NETA = "kita ikuyo (Bocchi the Rock!), 1girl, anime style, vibrant colors, red hair, medium hair with one side up, green eyes, bangs, hair between eyes, school uniform (white shirt, grey serafuku sailor collar, red neckerchief, pleated skirt), sitting upper body close-up, holding bouquet with white lily & pink flowers, indoors with depth of field, cherry blossom-like light particles, soft sunlight backlighting, bloom, chromatic aberration & lens flare abuse, light smile, closed mouth, one side hair up, transparent blurry foreground, warm cozy atmosphere, masterpiece, best quality"
pipeline_newbie_img2img.py ADDED
@@ -0,0 +1,563 @@
1
+ # Copyright 2025 Alpha-VLLM and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ from transformers import (
22
+ PreTrainedModel,
23
+ Gemma3PreTrainedModel,
24
+ GemmaTokenizer,
25
+ GemmaTokenizerFast,
26
+ XLMRobertaTokenizer,
27
+ XLMRobertaTokenizerFast
28
+ )
29
+
30
+ from diffusers.pipelines.pipeline_utils import ImagePipelineOutput
31
+ from diffusers.image_processor import PipelineImageInput
32
+ from diffusers.pipelines.newbie.pipeline_newbie import NewbiePipeline
33
+
34
+ from diffusers.models import AutoencoderKL
35
+ from diffusers.models.transformers.transformer_lumina2 import Lumina2Transformer2DModel
36
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
37
+
38
+ from diffusers.utils import (
39
+ is_torch_xla_available,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
+ from diffusers.utils.torch_utils import randn_tensor
44
+
45
+
46
+ if is_torch_xla_available():
47
+ import torch_xla.core.xla_model as xm
48
+ XLA_AVAILABLE = True
49
+ else:
50
+ XLA_AVAILABLE = False
51
+
52
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
53
+
54
+
55
+ EXAMPLE_DOC_STRING = """
56
+ Examples:
57
+ ```py
58
+ >>> import torch
59
+ >>> from diffusers import NewbieImg2ImgPipeline
60
+ >>> from diffusers.utils import load_image
61
+ >>> from transformers import AutoModel
62
+
63
+ >>> device = "cuda"
64
+ >>> model_path = "Disty0/NewBie-image-Exp0.1-Diffusers"
65
+ >>> text_encoder_2 = AutoModel.from_pretrained(model_path, subfolder="text_encoder_2", trust_remote_code=True, torch_dtype=torch.bfloat16)
66
+
67
+ >>> pipe = NewbieImg2ImgPipeline.from_pretrained(model_path, text_encoder_2=text_encoder_2, torch_dtype=torch.bfloat16)
68
+ >>> pipe.enable_model_cpu_offload(device=device)
69
+
70
+ >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
71
+ >>> init_image = load_image(url).resize((1024, 1024))
72
+
73
+ >>> prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors, anime style"
74
+ >>> negative_prompt = "low quality, worst quality, blurry"
75
+
76
+ >>> image = pipe(
77
+ >>> prompt,
78
+ >>> image=init_image,
79
+ >>> strength=0.6,
80
+ >>> negative_prompt=negative_prompt,
81
+ >>> guidance_scale=2.5,
82
+ >>> num_inference_steps=30,
83
+ >>> generator=torch.manual_seed(42),
84
+ >>> ).images[0]
85
+ ```
86
+ """
87
+
88
+
89
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
90
+ def calculate_shift(
91
+ image_seq_len,
92
+ base_seq_len: int = 256,
93
+ max_seq_len: int = 4096,
94
+ base_shift: float = 0.5,
95
+ max_shift: float = 1.15,
96
+ ):
97
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
98
+ b = base_shift - m * base_seq_len
99
+ mu = image_seq_len * m + b
100
+ return mu
101
+
102
+
103
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
104
+ def retrieve_timesteps(
105
+ scheduler,
106
+ num_inference_steps: Optional[int] = None,
107
+ device: Optional[Union[str, torch.device]] = None,
108
+ timesteps: Optional[List[int]] = None,
109
+ sigmas: Optional[List[float]] = None,
110
+ **kwargs,
111
+ ):
112
+ r"""
113
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
114
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
115
+
116
+ Args:
117
+ scheduler (`SchedulerMixin`):
118
+ The scheduler to get timesteps from.
119
+ num_inference_steps (`int`):
120
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
121
+ must be `None`.
122
+ device (`str` or `torch.device`, *optional*):
123
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
124
+ timesteps (`List[int]`, *optional*):
125
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
126
+ `num_inference_steps` and `sigmas` must be `None`.
127
+ sigmas (`List[float]`, *optional*):
128
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
129
+ `num_inference_steps` and `timesteps` must be `None`.
130
+
131
+ Returns:
132
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
133
+ second element is the number of inference steps.
134
+ """
135
+ if timesteps is not None and sigmas is not None:
136
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
137
+ if timesteps is not None:
138
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
139
+ if not accepts_timesteps:
140
+ raise ValueError(
141
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
142
+ f" timestep schedules. Please check whether you are using the correct scheduler."
143
+ )
144
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
145
+ timesteps = scheduler.timesteps
146
+ num_inference_steps = len(timesteps)
147
+ elif sigmas is not None:
148
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
149
+ if not accept_sigmas:
150
+ raise ValueError(
151
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
152
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
153
+ )
154
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
155
+ timesteps = scheduler.timesteps
156
+ num_inference_steps = len(timesteps)
157
+ else:
158
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
159
+ timesteps = scheduler.timesteps
160
+ return timesteps, num_inference_steps
161
+
162
+
163
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
164
+ def retrieve_latents(
165
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
166
+ ):
167
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
168
+ return encoder_output.latent_dist.sample(generator)
169
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
170
+ return encoder_output.latent_dist.mode()
171
+ elif hasattr(encoder_output, "latents"):
172
+ return encoder_output.latents
173
+ else:
174
+ raise AttributeError("Could not access latents of provided encoder_output")
175
+
176
+
177
+ class NewbieImg2ImgPipeline(NewbiePipeline):
178
+ r"""
179
+ Pipeline for image-to-image generation using Lumina-T2I / Newbie model.
180
+
181
+ This model inherits from [`NewbiePipeline`]. Check the superclass documentation for the generic methods the
182
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
183
+
184
+ Args:
185
+ vae ([`AutoencoderKL`]):
186
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
187
+ text_encoder ([`Gemma3PreTrainedModel`]):
188
+ Frozen Gemma3 text-encoder.
189
+ text_encoder_2 ([`PreTrainedModel`]):
190
+ Frozen JinaCLIPTextModel text-encoder. Requires `trust_remote_code=True`.
191
+ tokenizer (`GemmaTokenizer` or `GemmaTokenizerFast`):
192
+ Gemma tokenizer.
193
+ tokenizer_2 (`XLMRobertaTokenizer` or `XLMRobertaTokenizerFast`):
194
+ XLMRoberta tokenizer.
195
+ transformer ([`Transformer2DModel`]):
196
+ A text conditioned `Transformer2DModel` to denoise the encoded image latents.
197
+ scheduler ([`SchedulerMixin`]):
198
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
199
+ """
200
+
201
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
202
+ def get_timesteps(self, num_inference_steps, strength, device):
203
+ # get the original timestep using init_timestep
204
+ init_timestep = min(num_inference_steps * strength, num_inference_steps)
205
+
206
+ t_start = int(max(num_inference_steps - init_timestep, 0))
207
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
208
+ if hasattr(self.scheduler, "set_begin_index"):
209
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
210
+
211
+ return timesteps, num_inference_steps - t_start
212
+
213
+ def prepare_latents(
214
+ self,
215
+ image,
216
+ timestep,
217
+ batch_size,
218
+ num_channels_latents,
219
+ height,
220
+ width,
221
+ dtype,
222
+ device,
223
+ generator,
224
+ latents=None,
225
+ ):
226
+ if latents is not None:
227
+ return latents.to(device=device, dtype=dtype)
228
+
229
+ # 1. Encode the input image
230
+ image = image.to(device=device, dtype=dtype)
231
+
232
+ if image.shape[1] == num_channels_latents:
233
+ image_latents = image
234
+ else:
235
+ if isinstance(generator, list):
236
+ image_latents = [
237
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
238
+ for i in range(image.shape[0])
239
+ ]
240
+ image_latents = torch.cat(image_latents, dim=0)
241
+ else:
242
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
243
+
244
+ # Apply scaling
245
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
246
+
247
+ # 2. Handle batch size expansion for num_images_per_prompt
248
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
249
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
250
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
251
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
252
+ raise ValueError(
253
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
254
+ )
255
+
256
+ # 3. Add noise to latents
257
+ shape = image_latents.shape
258
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
259
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
260
+
261
+ return latents
262
+
263
+ @torch.no_grad()
264
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
265
+ def __call__(
266
+ self,
267
+ prompt: Union[str, List[str]] = None,
268
+ image: PipelineImageInput = None,
269
+ strength: float = 0.6,
270
+ width: Optional[int] = None,
271
+ height: Optional[int] = None,
272
+ num_inference_steps: int = 30,
273
+ guidance_scale: float = 4.0,
274
+ negative_prompt: Union[str, List[str]] = None,
275
+ sigmas: List[float] = None,
276
+ num_images_per_prompt: Optional[int] = 1,
277
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
278
+ latents: Optional[torch.Tensor] = None,
279
+ prompt_embeds: Optional[torch.Tensor] = None,
280
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
281
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
282
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
283
+ prompt_attention_mask: Optional[torch.Tensor] = None,
284
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
285
+ output_type: Optional[str] = "pil",
286
+ return_dict: bool = True,
287
+ attention_kwargs: Optional[Dict[str, Any]] = None,
288
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
289
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
290
+ system_prompt: Optional[str] = None,
291
+ cfg_trunc_ratio: float = 1.0,
292
+ cfg_normalization: bool = True,
293
+ max_sequence_length: int = 512,
294
+ ) -> Union[ImagePipelineOutput, Tuple]:
295
+ """
296
+ Function invoked when calling the pipeline for image-to-image generation.
297
+
298
+ Args:
299
+ prompt (`str` or `List[str]`, *optional*):
300
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
301
+ instead.
302
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
303
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point.
304
+ strength (`float`, *optional*, defaults to 0.6):
305
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
306
+ starting point and more noise is added the higher the `strength`.
307
+ negative_prompt (`str` or `List[str]`, *optional*):
308
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
309
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
310
+ less than `1`).
311
+ num_inference_steps (`int`, *optional*, defaults to 30):
312
+ The number of denoising steps.
313
+ guidance_scale (`float`, *optional*, defaults to 4.0):
314
+ Guidance scale as defined in [Classifier-Free Diffusion
315
+ Guidance](https://huggingface.co/papers/2207.12598).
316
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
317
+ The number of images to generate per prompt.
318
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
319
+ The height in pixels of the generated image. If not provided, it is inferred from input image.
320
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
321
+ The width in pixels of the generated image. If not provided, it is inferred from input image.
322
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
323
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
324
+ to make generation deterministic.
325
+ latents (`torch.Tensor`, *optional*):
326
+ Pre-generated noisy latents.
327
+ prompt_embeds (`torch.Tensor`, *optional*):
328
+ Pre-generated text embeddings.
329
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
330
+ Pre-generated pooled text embeddings.
331
+ prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
332
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
333
+ Pre-generated negative text embeddings.
334
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*):
335
+ Pre-generated attention mask for negative text embeddings.
336
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
337
+ Pre-generated negative pooled text embeddings.
338
+ output_type (`str`, *optional*, defaults to `"pil"`):
339
+ The output format of the generate image. Choose between
340
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
341
+ return_dict (`bool`, *optional*, defaults to `True`):
342
+ Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
343
+ attention_kwargs:
344
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor`.
345
+ callback_on_step_end (`Callable`, *optional*):
346
+ A function that calls at the end of each denoising steps during the inference.
347
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
348
+ The list of tensor inputs for the `callback_on_step_end` function.
349
+ system_prompt (`str`, *optional*):
350
+ The system prompt to use for the image generation.
351
+ cfg_trunc_ratio (`float`, *optional*, defaults to `1.0`):
352
+ The ratio of the timestep interval to apply normalization-based guidance scale.
353
+ cfg_normalization (`bool`, *optional*, defaults to `True`):
354
+ Whether to apply normalization-based guidance scale.
355
+ max_sequence_length (`int`, defaults to `512`):
356
+ Maximum sequence length to use with the `prompt`.
357
+
358
+ Examples:
359
+
360
+ Returns:
361
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
362
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
363
+ returned where the first element is a list with the generated images
364
+ """
365
+ # 1. Check strength
366
+ if strength < 0 or strength > 1:
367
+ raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
368
+
369
+ # 2. Preprocess image
370
+ init_image = self.image_processor.preprocess(image)
371
+ init_image = init_image.to(dtype=torch.float32)
372
+
373
+ # Get dimensions from image if not specified
374
+ if height is None:
375
+ height = init_image.shape[-2]
376
+ if width is None:
377
+ width = init_image.shape[-1]
378
+
379
+ self._guidance_scale = guidance_scale
380
+ self._attention_kwargs = attention_kwargs
381
+
382
+ # 3. Check inputs. Raise error if not correct
383
+ self.check_inputs(
384
+ prompt,
385
+ height,
386
+ width,
387
+ negative_prompt,
388
+ prompt_embeds=prompt_embeds,
389
+ pooled_prompt_embeds=pooled_prompt_embeds,
390
+ negative_prompt_embeds=negative_prompt_embeds,
391
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
392
+ prompt_attention_mask=prompt_attention_mask,
393
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
394
+ max_sequence_length=max_sequence_length,
395
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
396
+ )
397
+
398
+ # 4. Define call parameters
399
+ if prompt is not None and isinstance(prompt, str):
400
+ batch_size = 1
401
+ elif prompt is not None and isinstance(prompt, list):
402
+ batch_size = len(prompt)
403
+ else:
404
+ batch_size = prompt_embeds.shape[0]
405
+
406
+ device = self._execution_device
407
+
408
+ # 5. Encode input prompt
409
+ (
410
+ prompt_embeds,
411
+ pooled_prompt_embeds,
412
+ prompt_attention_mask,
413
+ negative_prompt_embeds,
414
+ negative_pooled_prompt_embeds,
415
+ negative_prompt_attention_mask,
416
+ ) = self.encode_prompt(
417
+ prompt,
418
+ self.do_classifier_free_guidance,
419
+ negative_prompt=negative_prompt,
420
+ num_images_per_prompt=num_images_per_prompt,
421
+ device=device,
422
+ prompt_embeds=prompt_embeds,
423
+ pooled_prompt_embeds=pooled_prompt_embeds,
424
+ negative_prompt_embeds=negative_prompt_embeds,
425
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
426
+ prompt_attention_mask=prompt_attention_mask,
427
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
428
+ max_sequence_length=max_sequence_length,
429
+ system_prompt=system_prompt,
430
+ )
431
+
432
+ # 6. Prepare timesteps
433
+ full_sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
434
+
435
+ latent_height = height // (self.vae_scale_factor * 2) * 2
436
+ latent_width = width // (self.vae_scale_factor * 2) * 2
437
+ image_seq_len = (latent_height // 2) * (latent_width // 2)
438
+
439
+ mu = calculate_shift(
440
+ image_seq_len,
441
+ self.scheduler.config.get("base_image_seq_len", 256),
442
+ self.scheduler.config.get("max_image_seq_len", 4096),
443
+ self.scheduler.config.get("base_shift", 0.5),
444
+ self.scheduler.config.get("max_shift", 1.15),
445
+ )
446
+
447
+ timesteps, num_inference_steps = retrieve_timesteps(
448
+ self.scheduler,
449
+ num_inference_steps,
450
+ device,
451
+ sigmas=full_sigmas,
452
+ mu=mu,
453
+ )
454
+
455
+ # 7. Adjust timesteps based on strength
456
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
457
+ if num_inference_steps < 1:
458
+ raise ValueError(
459
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
460
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
461
+ )
462
+
463
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
464
+
465
+ # 8. Prepare latents
466
+ latents = self.prepare_latents(
467
+ init_image,
468
+ latent_timestep,
469
+ batch_size * num_images_per_prompt,
470
+ self.transformer.config.in_channels,
471
+ height,
472
+ width,
473
+ prompt_embeds.dtype,
474
+ device,
475
+ generator,
476
+ latents,
477
+ )
478
+
479
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
480
+ self._num_timesteps = len(timesteps)
481
+
482
+ # 9. Denoising loop
483
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
484
+ for i, t in enumerate(timesteps):
485
+ # compute whether apply classifier-free truncation on this timestep
486
+ do_classifier_free_truncation = (i + 1) / num_inference_steps > cfg_trunc_ratio
487
+
488
+ # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
489
+ current_timestep = 1 - t / self.scheduler.config.num_train_timesteps
490
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
491
+ current_timestep = current_timestep.expand(latents.shape[0])
492
+
493
+ noise_pred_cond = self.transformer(
494
+ hidden_states=latents,
495
+ timestep=current_timestep,
496
+ encoder_hidden_states=prompt_embeds,
497
+ pooled_projections=pooled_prompt_embeds,
498
+ encoder_attention_mask=prompt_attention_mask,
499
+ return_dict=False,
500
+ attention_kwargs=self.attention_kwargs,
501
+ )[0]
502
+
503
+ # perform normalization-based guidance scale on a truncated timestep interval
504
+ if self.do_classifier_free_guidance and not do_classifier_free_truncation:
505
+ noise_pred_uncond = self.transformer(
506
+ hidden_states=latents,
507
+ timestep=current_timestep,
508
+ encoder_hidden_states=negative_prompt_embeds,
509
+ pooled_projections=negative_pooled_prompt_embeds,
510
+ encoder_attention_mask=negative_prompt_attention_mask,
511
+ return_dict=False,
512
+ attention_kwargs=self.attention_kwargs,
513
+ )[0]
514
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
515
+ # apply normalization after classifier-free guidance
516
+ if cfg_normalization:
517
+ cond_norm = torch.norm(noise_pred_cond, dim=-1, keepdim=True)
518
+ noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
519
+ noise_pred = noise_pred * (cond_norm / noise_norm)
520
+ else:
521
+ noise_pred = noise_pred_cond
522
+
523
+ # compute the previous noisy sample x_t -> x_t-1
524
+ latents_dtype = latents.dtype
525
+ noise_pred = -noise_pred
526
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
527
+
528
+ if latents.dtype != latents_dtype:
529
+ if torch.backends.mps.is_available():
530
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
531
+ latents = latents.to(latents_dtype)
532
+
533
+ if callback_on_step_end is not None:
534
+ callback_kwargs = {}
535
+ for k in callback_on_step_end_tensor_inputs:
536
+ callback_kwargs[k] = locals()[k]
537
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
538
+
539
+ latents = callback_outputs.pop("latents", latents)
540
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
541
+ pooled_prompt_embeds = callback_outputs.pop("pooled_prompt_embeds", pooled_prompt_embeds)
542
+
543
+ # call the callback, if provided
544
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
545
+ progress_bar.update()
546
+
547
+ if XLA_AVAILABLE:
548
+ xm.mark_step()
549
+
550
+ if not output_type == "latent":
551
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
552
+ image = self.vae.decode(latents, return_dict=False)[0]
553
+ image = self.image_processor.postprocess(image, output_type=output_type)
554
+ else:
555
+ image = latents
556
+
557
+ # Offload all models
558
+ self.maybe_free_model_hooks()
559
+
560
+ if not return_dict:
561
+ return (image,)
562
+
563
+ return ImagePipelineOutput(images=image)
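
The img2img behaviour hinges on get_timesteps above: strength decides how much of the flow-match schedule is skipped before denoising starts from the noised input latents. A small worked sketch with hypothetical values:

num_inference_steps = 30
strength = 0.6

init_timestep = min(num_inference_steps * strength, num_inference_steps)  # 18.0
t_start = int(max(num_inference_steps - init_timestep, 0))                # 12

# Only timesteps[t_start:] are run, i.e. 18 denoising steps, starting from the reference
# image latents noised to the matching sigma via scheduler.scale_noise in prepare_latents.
remaining_steps = num_inference_steps - t_start  # 18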
pre-requirements.txt ADDED
@@ -0,0 +1 @@
1
+ stablepy==0.6.5
requirements.txt CHANGED
@@ -6,4 +6,7 @@ accelerate
6
  timm
7
  torchvision
8
  einops
9
- kernels
6
  timm
7
  torchvision
8
  einops
9
+ torchao==0.11.0
10
+ kernels
11
+ torchsde>=0.2.6
12
+ accelerate==1.12.0
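
On Hugging Face Spaces, pre-requirements.txt is installed before requirements.txt, so stablepy (which supplies the FlowMatch scheduler map imported in app.py) is in place first; torchsde is presumably needed by the SDE samplers. Reproducing the same order locally would look roughly like:

pip install -r pre-requirements.txt
pip install -r requirements.txt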