akhaliq (HF Staff) committed
Commit 81f9012 · 1 Parent(s): f4523d4

WIP: text-to-music feature

Files changed (1)
  1. app.py +242 -17
app.py CHANGED
Changed hunks (removed lines shown here; each updated hunk appears in full, with its new line numbers, below):

@@ -146,6 +146,61 @@ def reap_old_videos(ttl_seconds: int = VIDEO_FILE_TTL_SECONDS) -> None:
@@ -1524,6 +1579,68 @@ def generate_video_from_text(prompt: str, session_id: Optional[str] = None) -> s
@@ -1816,6 +1933,53 @@ def create_video_replacement_blocks_text_to_video(html_content: str, prompt: str
@@ -1988,7 +2152,7 @@ def create_video_replacement_blocks_from_input_image(html_content: str, user_pro
- def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: str | None = None) -> str:
@@ -1997,7 +2161,7 @@ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_t
- f"enable_t2i={enable_text_to_image}, has_image={input_image_data is not None}"
@@ -2035,6 +2199,18 @@ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_t
@@ -2883,7 +3059,7 @@ The HTML code above contains the complete original website structure with all im
- def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: Optional[str] = None):
@@ -2923,7 +3099,9 @@ def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_ima
@@ -3023,9 +3201,9 @@ This will help me create a better design for you."""
- # Apply image generation (text→image and/or image→image)
- final_content = apply_generated_images_to_html(
@@ -3037,6 +3215,8 @@ This will help me create a better design for you."""
@@ -3190,9 +3370,9 @@ This will help me create a better design for you."""
- # Apply image generation (text→image and/or image→image)
- clean_content = apply_generated_images_to_html(
@@ -3204,6 +3384,8 @@ This will help me create a better design for you."""
@@ -3213,9 +3395,9 @@ This will help me create a better design for you."""
- # Apply image generation (text→image and/or image→image)
- final_content = apply_generated_images_to_html(
@@ -3228,6 +3410,8 @@ This will help me create a better design for you."""
@@ -3615,9 +3799,9 @@ This will help me create a better design for you."""
- # Apply image generation (text→image and/or image→image)
- clean_content = apply_generated_images_to_html(
@@ -3630,6 +3814,8 @@ This will help me create a better design for you."""
@@ -3644,9 +3830,9 @@ This will help me create a better design for you."""
- # Apply image generation (text→image and/or image→image)
- final_content = apply_generated_images_to_html(
@@ -3659,6 +3845,8 @@ This will help me create a better design for you."""
@@ -4853,6 +5041,20 @@ with gr.Blocks(
@@ -4886,6 +5088,11 @@ with gr.Blocks(
@@ -5136,7 +5343,7 @@ with gr.Blocks(
- inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt, text_to_video_toggle, text_to_video_prompt],
@@ -5188,6 +5395,8 @@ with gr.Blocks(
@@ -5253,6 +5462,13 @@ with gr.Blocks(
@@ -5317,6 +5533,8 @@ with gr.Blocks(
@@ -5342,6 +5560,8 @@ with gr.Blocks(
@@ -5351,7 +5571,7 @@ with gr.Blocks(
- inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt, text_to_video_toggle, text_to_video_prompt],
@@ -5368,12 +5588,13 @@ with gr.Blocks(
- def toggle_beta(checked: bool, t2i: bool, i2i: bool, i2v: bool, t2v: bool):
@@ -5397,6 +5618,8 @@ with gr.Blocks(
@@ -5404,7 +5627,7 @@
- inputs=[beta_toggle, image_generation_toggle, image_to_image_toggle, image_to_video_toggle, text_to_video_toggle],
@@ -5425,6 +5648,8 @@ with gr.Blocks(

146
  # Temp dir might not exist or be accessible; ignore
147
  pass
148
 
149
+ # ---------------------------------------------------------------------------
150
+ # Audio temp-file management (per-session tracking and cleanup)
151
+ # ---------------------------------------------------------------------------
152
+ AUDIO_TEMP_DIR = os.path.join(tempfile.gettempdir(), "anycoder_audio")
153
+ AUDIO_FILE_TTL_SECONDS = 6 * 60 * 60 # 6 hours
154
+ _SESSION_AUDIO_FILES: Dict[str, List[str]] = {}
155
+ _AUDIO_FILES_LOCK = threading.Lock()
156
+
157
+
158
+ def _ensure_audio_dir_exists() -> None:
159
+ try:
160
+ os.makedirs(AUDIO_TEMP_DIR, exist_ok=True)
161
+ except Exception:
162
+ pass
163
+
164
+
165
+ def _register_audio_for_session(session_id: Optional[str], file_path: str) -> None:
166
+ if not session_id or not file_path:
167
+ return
168
+ with _AUDIO_FILES_LOCK:
169
+ if session_id not in _SESSION_AUDIO_FILES:
170
+ _SESSION_AUDIO_FILES[session_id] = []
171
+ _SESSION_AUDIO_FILES[session_id].append(file_path)
172
+
173
+
174
+ def cleanup_session_audio(session_id: Optional[str]) -> None:
175
+ if not session_id:
176
+ return
177
+ with _AUDIO_FILES_LOCK:
178
+ file_list = _SESSION_AUDIO_FILES.pop(session_id, [])
179
+ for path in file_list:
180
+ try:
181
+ if path and os.path.exists(path):
182
+ os.unlink(path)
183
+ except Exception:
184
+ pass
185
+
186
+
187
+ def reap_old_audio(ttl_seconds: int = AUDIO_FILE_TTL_SECONDS) -> None:
188
+ try:
189
+ _ensure_audio_dir_exists()
190
+ now_ts = time.time()
191
+ for name in os.listdir(AUDIO_TEMP_DIR):
192
+ path = os.path.join(AUDIO_TEMP_DIR, name)
193
+ try:
194
+ if not os.path.isfile(path):
195
+ continue
196
+ mtime = os.path.getmtime(path)
197
+ if now_ts - mtime > ttl_seconds:
198
+ os.unlink(path)
199
+ except Exception:
200
+ pass
201
+ except Exception:
202
+ pass
203
+
204
  TRANSFORMERS_JS_SYSTEM_PROMPT = """You are an expert web developer creating a transformers.js application. You will generate THREE separate files: index.html, index.js, and style.css.
205
 
206
  IMPORTANT: You MUST output ALL THREE files in the following format:
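The audio temp-file helpers added in this hunk mirror the existing video handling: each generated file is written under AUDIO_TEMP_DIR, registered against a session id, deleted when that session regenerates, and swept globally once it is older than the six-hour TTL. A minimal usage sketch, assuming it runs inside app.py where these helpers and constants are defined (the session id and file name here are made up for illustration):

import os

_ensure_audio_dir_exists()
audio_path = os.path.join(AUDIO_TEMP_DIR, "example.wav")    # hypothetical file
with open(audio_path, "wb") as f:
    f.write(b"\x00" * 1024)                                 # placeholder bytes, not real audio
_register_audio_for_session("session-123", audio_path)      # tracked for later cleanup

cleanup_session_audio("session-123")   # removes every file registered for that session
reap_old_audio(ttl_seconds=0)          # sweeps anything left in AUDIO_TEMP_DIR past the TTL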
 
1579
  print(f"Text-to-video generation error: {str(e)}")
1580
  return f"Error generating video (text-to-video): {str(e)}"
1581
 
1582
+ def generate_music_from_text(prompt: str, music_length_ms: int = 30000, session_id: Optional[str] = None) -> str:
1583
+ """Generate music from a text prompt using ElevenLabs Music API and return an HTML <audio> tag.
1584
+
1585
+ Saves audio to a temp file and references it via file:// URL similar to videos.
1586
+ Requires ELEVENLABS_API_KEY in the environment.
1587
+ """
1588
+ try:
1589
+ api_key = os.getenv('ELEVENLABS_API_KEY')
1590
+ if not api_key:
1591
+ return "Error: ELEVENLABS_API_KEY environment variable is not set."
1592
+
1593
+ headers = {
1594
+ 'Content-Type': 'application/json',
1595
+ 'xi-api-key': api_key,
1596
+ }
1597
+ payload = {
1598
+ 'prompt': (prompt or 'Epic orchestral theme with soaring strings and powerful brass'),
1599
+ 'music_length_ms': int(music_length_ms) if music_length_ms else 30000,
1600
+ }
1601
+
1602
+ resp = requests.post('https://api.elevenlabs.io/v1/music/compose', headers=headers, json=payload)
1603
+ try:
1604
+ resp.raise_for_status()
1605
+ except Exception as e:
1606
+ return f"Error generating music: {getattr(e, 'response', resp).text if hasattr(e, 'response') else resp.text}"
1607
+
1608
+ # Persist audio to temp file and return an <audio> element using file:// URL
1609
+ _ensure_audio_dir_exists()
1610
+ file_name = f"{uuid.uuid4()}.wav"
1611
+ file_path = os.path.join(AUDIO_TEMP_DIR, file_name)
1612
+ try:
1613
+ with open(file_path, 'wb') as f:
1614
+ f.write(resp.content)
1615
+ _register_audio_for_session(session_id, file_path)
1616
+ except Exception as save_exc:
1617
+ return f"Error generating music: could not save audio file ({save_exc})"
1618
+
1619
+ # Build file URI
1620
+ try:
1621
+ from pathlib import Path
1622
+ file_url = Path(file_path).as_uri()
1623
+ except Exception:
1624
+ if file_path.startswith('/'):
1625
+ file_url = f"file:///{file_path.lstrip('/')}"
1626
+ else:
1627
+ file_url = f"file:///{file_path}"
1628
+
1629
+ audio_html = (
1630
+ "<div class=\"anycoder-music\" style=\"max-width:420px;margin:16px auto;padding:12px 16px;border:1px solid #e5e7eb;border-radius:12px;background:linear-gradient(180deg,#fafafa,#f3f4f6);box-shadow:0 2px 8px rgba(0,0,0,0.06)\">"
1631
+ " <div style=\"font-size:13px;color:#374151;margin-bottom:8px;display:flex;align-items:center;gap:6px\">"
1632
+ " <span>🎵 Generated music</span>"
1633
+ " </div>"
1634
+ f" <audio controls autoplay loop style=\"width:100%;outline:none;\">"
1635
+ f" <source src=\"{file_url}\" type=\"audio/wav\" />"
1636
+ " Your browser does not support the audio element."
1637
+ " </audio>"
1638
+ "</div>"
1639
+ )
1640
+ return audio_html
1641
+ except Exception as e:
1642
+ return f"Error generating music: {str(e)}"
1643
+
1644
  def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
1645
  """Extract image generation prompts from the full text based on number of images needed"""
1646
  # Use the entire text as the base prompt for image generation
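At its core, the generate_music_from_text helper above is one HTTP call plus a temp-file write. A standalone sketch of that call, reusing the endpoint, header, and payload fields exactly as they appear in the diff (the live ElevenLabs API may differ from this; ELEVENLABS_API_KEY is assumed to be set in the environment):

import os
from pathlib import Path

import requests

resp = requests.post(
    "https://api.elevenlabs.io/v1/music/compose",
    headers={
        "Content-Type": "application/json",
        "xi-api-key": os.environ["ELEVENLABS_API_KEY"],
    },
    json={
        "prompt": "Epic orchestral theme with soaring strings and powerful brass",
        "music_length_ms": 30000,
    },
)
resp.raise_for_status()

out_path = Path("generated_music.wav")      # the app writes into AUDIO_TEMP_DIR instead
out_path.write_bytes(resp.content)          # raw audio bytes, saved as .wav like the helper does
print(out_path.resolve().as_uri())          # file:// URL, as embedded in the generated <audio> tag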
 
1933
  # If no <body>, just append
1934
  return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
1935
 
1936
+ def create_music_replacement_blocks_text_to_music(html_content: str, prompt: str, session_id: Optional[str] = None) -> str:
1937
+ """Create search/replace blocks that insert ONE generated <audio> near the top of <body>.
1938
+
1939
+ Unlike images/videos which replace placeholders, music doesn't map to an <img> tag.
1940
+ We simply insert an <audio> player after the opening <body>.
1941
+ """
1942
+ if not prompt or not prompt.strip():
1943
+ return ""
1944
+
1945
+ audio_html = generate_music_from_text(prompt, session_id=session_id)
1946
+ if audio_html.startswith("Error"):
1947
+ return ""
1948
+
1949
+ # Prefer inserting after the first <section>...</section> if present; else after <body>
1950
+ import re
1951
+ section_match = re.search(r"<section\b[\s\S]*?</section>", html_content, flags=re.IGNORECASE)
1952
+ if section_match:
1953
+ section_html = section_match.group(0)
1954
+ section_clean = re.sub(r"\s+", " ", section_html.strip())
1955
+ variations = [
1956
+ section_html,
1957
+ section_clean,
1958
+ section_clean.replace('"', "'"),
1959
+ section_clean.replace("'", '"'),
1960
+ re.sub(r"\s+", " ", section_clean),
1961
+ ]
1962
+ blocks = []
1963
+ for v in variations:
1964
+ blocks.append(f"""{SEARCH_START}
1965
+ {v}
1966
+ {DIVIDER}
1967
+ {v}\n {audio_html}
1968
+ {REPLACE_END}""")
1969
+ return "\n\n".join(blocks)
1970
+ if '<body' in html_content:
1971
+ body_end = html_content.find('>', html_content.find('<body')) + 1
1972
+ insertion_point = html_content[:body_end] + '\n '
1973
+ return f"""{SEARCH_START}
1974
+ {insertion_point}
1975
+ {DIVIDER}
1976
+ {insertion_point}
1977
+ {audio_html}
1978
+ {REPLACE_END}"""
1979
+
1980
+ # If no <body>, just append
1981
+ return f"{SEARCH_START}\n\n{DIVIDER}\n{audio_html}\n{REPLACE_END}"
1982
+
1983
  def create_image_replacement_blocks_from_input_image(html_content: str, user_prompt: str, input_image_data, max_images: int = 1) -> str:
1984
  """Create search/replace blocks using image-to-image generation with a provided input image.
1985
 
 
2152
  print("[Image2Video] No <body> tag; appending video via replacement block")
2153
  return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
2154
 
2155
+ def apply_generated_media_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: str | None = None, enable_text_to_music: bool = False, text_to_music_prompt: str | None = None) -> str:
2156
  """Apply text-to-image and/or image-to-image replacements to HTML content.
2157
 
2158
  If both toggles are enabled, text-to-image replacements run first, then image-to-image.
 
2161
  try:
2162
  print(
2163
  f"[MediaApply] enable_i2v={enable_image_to_video}, enable_i2i={enable_image_to_image}, "
2164
+ f"enable_t2i={enable_text_to_image}, enable_t2v={enable_text_to_video}, enable_t2m={enable_text_to_music}, has_image={input_image_data is not None}"
2165
  )
2166
  # If image-to-video is enabled, replace the first image with a generated video and return.
2167
  if enable_image_to_video and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
 
2199
  print("[MediaApply] No t2v replacement blocks generated")
2200
  return result
2201
 
2202
+ # If text-to-music is enabled, insert a generated audio player near the top of body and return.
2203
+ if enable_text_to_music and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
2204
+ t2m_prompt = (text_to_music_prompt or user_prompt or "").strip()
2205
+ print(f"[MediaApply] Running text-to-music with prompt len={len(t2m_prompt)}")
2206
+ blocks_tm = create_music_replacement_blocks_text_to_music(result, t2m_prompt, session_id=session_id)
2207
+ if blocks_tm:
2208
+ print("[MediaApply] Applying text-to-music replacement blocks")
2209
+ result = apply_search_replace_changes(result, blocks_tm)
2210
+ else:
2211
+ print("[MediaApply] No t2m replacement blocks generated")
2212
+ return result
2213
+
2214
  # If an input image is provided and image-to-image is enabled, we only replace one image
2215
  # and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
2216
  if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
 
3059
  stop_generation = False
3060
 
3061
 
3062
+ def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: Optional[str] = None, enable_text_to_music: bool = False, text_to_music_prompt: Optional[str] = None):
3063
  if query is None:
3064
  query = ''
3065
  if _history is None:
 
3099
  # On each generate, reap old global files and cleanup previous session files
3100
  try:
3101
  cleanup_session_videos(session_id)
3102
+ cleanup_session_audio(session_id)
3103
  reap_old_videos()
3104
+ reap_old_audio()
3105
  except Exception:
3106
  pass
3107
 
 
3201
 
3202
  clean_code = remove_code_block(content)
3203
 
3204
+ # Apply media generation (images/video/music)
3205
  print("[Generate] Applying post-generation media to GLM-4.5 HTML output")
3206
+ final_content = apply_generated_media_to_html(
3207
  content,
3208
  query,
3209
  enable_text_to_image=enable_image_generation,
 
3215
  session_id=session_id,
3216
  enable_text_to_video=enable_text_to_video,
3217
  text_to_video_prompt=text_to_video_prompt,
3218
+ enable_text_to_music=enable_text_to_music,
3219
+ text_to_music_prompt=text_to_music_prompt,
3220
  )
3221
 
3222
  _history.append([query, final_content])
 
3370
  modified_content = apply_search_replace_changes(last_content, clean_code)
3371
  clean_content = remove_code_block(modified_content)
3372
 
3373
+ # Apply media generation (images/video/music)
3374
  print("[Generate] Applying post-generation media to modified HTML content")
3375
+ clean_content = apply_generated_media_to_html(
3376
  clean_content,
3377
  query,
3378
  enable_text_to_image=enable_image_generation,
 
3384
  session_id=session_id,
3385
  enable_text_to_video=enable_text_to_video,
3386
  text_to_video_prompt=text_to_video_prompt,
3387
+ enable_text_to_music=enable_text_to_music,
3388
+ text_to_music_prompt=text_to_music_prompt,
3389
  )
3390
 
3391
  yield {
 
3395
  history_output: history_to_chatbot_messages(_history),
3396
  }
3397
  else:
3398
+ # Apply media generation (images/video/music)
3399
  print("[Generate] Applying post-generation media to new HTML content")
3400
+ final_content = apply_generated_media_to_html(
3401
  clean_code,
3402
  query,
3403
  enable_text_to_image=enable_image_generation,
 
3410
  session_id=session_id,
3411
  enable_text_to_video=enable_text_to_video,
3412
  text_to_video_prompt=text_to_video_prompt,
3413
+ enable_text_to_music=enable_text_to_music,
3414
+ text_to_music_prompt=text_to_music_prompt,
3415
  )
3416
 
3417
  preview_val = None
 
3799
  modified_content = apply_search_replace_changes(last_content, final_code)
3800
  clean_content = remove_code_block(modified_content)
3801
 
3802
+ # Apply media generation (images/video/music)
3803
  print("[Generate] Applying post-generation media to follow-up HTML content")
3804
+ clean_content = apply_generated_media_to_html(
3805
  clean_content,
3806
  query,
3807
  enable_text_to_image=enable_image_generation,
 
3814
  text_to_image_prompt=text_to_image_prompt,
3815
  enable_text_to_video=enable_text_to_video,
3816
  text_to_video_prompt=text_to_video_prompt,
3817
+ enable_text_to_music=enable_text_to_music,
3818
+ text_to_music_prompt=text_to_music_prompt,
3819
  )
3820
 
3821
  # Update history with the cleaned content
 
3830
  # Regular generation - use the content as is
3831
  final_content = remove_code_block(content)
3832
 
3833
+ # Apply media generation (images/video/music)
3834
  print("[Generate] Applying post-generation media to final HTML content")
3835
+ final_content = apply_generated_media_to_html(
3836
  final_content,
3837
  query,
3838
  enable_text_to_image=enable_image_generation,
 
3845
  session_id=session_id,
3846
  enable_text_to_video=enable_text_to_video,
3847
  text_to_video_prompt=text_to_video_prompt,
3848
+ enable_text_to_music=enable_text_to_music,
3849
+ text_to_music_prompt=text_to_music_prompt,
3850
  )
3851
 
3852
  _history.append([query, final_content])
 
5041
  visible=False
5042
  )
5043
 
5044
+ # Text-to-Music
5045
+ text_to_music_toggle = gr.Checkbox(
5046
+ label="🎵 Generate Music (text → music)",
5047
+ value=False,
5048
+ visible=True,
5049
+ info="Compose short music from your prompt using ElevenLabs Music"
5050
+ )
5051
+ text_to_music_prompt = gr.Textbox(
5052
+ label="Text-to-Music Prompt",
5053
+ placeholder="Describe the music to generate (e.g., 'Epic orchestral theme with soaring strings and powerful brass')",
5054
+ lines=2,
5055
+ visible=False
5056
+ )
5057
+
5058
  def on_image_to_image_toggle(toggled, beta_enabled):
5059
  # Only show in classic mode (beta disabled)
5060
  vis = bool(toggled) and not bool(beta_enabled)
 
5088
  inputs=[text_to_video_toggle, beta_toggle],
5089
  outputs=[text_to_video_prompt]
5090
  )
5091
+ text_to_music_toggle.change(
5092
+ on_text_to_image_toggle,
5093
+ inputs=[text_to_music_toggle, beta_toggle],
5094
+ outputs=[text_to_music_prompt]
5095
+ )
5096
  model_dropdown = gr.Dropdown(
5097
  choices=[model['name'] for model in AVAILABLE_MODELS],
5098
  value=DEFAULT_MODEL_NAME,
 
5343
  show_progress="hidden",
5344
  ).then(
5345
  generation_code,
5346
+ inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt, text_to_video_toggle, text_to_video_prompt, text_to_music_toggle, text_to_music_prompt],
5347
  outputs=[code_output, history, sandbox, history_output]
5348
  ).then(
5349
  end_generation_ui,
 
5395
  upd_t2v_prompt = gr.skip()
5396
  upd_model_dropdown = gr.skip()
5397
  upd_current_model = gr.skip()
5398
+ upd_t2m_toggle = gr.skip()
5399
+ upd_t2m_prompt = gr.skip()
5400
 
5401
  # Split by comma to separate main prompt and directives
5402
  segments = [seg.strip() for seg in (text or "").split(",") if seg.strip()]
 
5462
  if p:
5463
  upd_t2v_prompt = gr.update(value=p)
5464
 
5465
+ # Text-to-music
5466
+ if ("text to music" in seg_norm) or ("text-to-music" in seg_norm) or ("generate music" in seg_norm) or ("compose music" in seg_norm):
5467
+ upd_t2m_toggle = gr.update(value=True)
5468
+ p = after_colon(seg)
5469
+ if p:
5470
+ upd_t2m_prompt = gr.update(value=p)
5471
+
5472
  # URL (website redesign)
5473
  url = _extract_url(seg)
5474
  if url:
 
5533
  upd_t2v_prompt,
5534
  upd_model_dropdown,
5535
  upd_current_model,
5536
+ upd_t2m_toggle,
5537
+ upd_t2m_prompt,
5538
  )
5539
 
5540
  # Wire chat submit -> apply settings -> run generation
 
5560
  text_to_video_prompt,
5561
  model_dropdown,
5562
  current_model,
5563
+ text_to_music_toggle,
5564
+ text_to_music_prompt,
5565
  ],
5566
  queue=False,
5567
  ).then(
 
5571
  show_progress="hidden",
5572
  ).then(
5573
  generation_code,
5574
+ inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt, text_to_video_toggle, text_to_video_prompt, text_to_music_toggle, text_to_music_prompt],
5575
  outputs=[code_output, history, sandbox, history_output]
5576
  ).then(
5577
  end_generation_ui,
 
5588
  )
5589
 
5590
  # Toggle between classic controls and beta chat UI
5591
+ def toggle_beta(checked: bool, t2i: bool, i2i: bool, i2v: bool, t2v: bool, t2m: bool):
5592
  # Prompts only visible in classic mode and when their toggles are on
5593
  t2i_vis = (not checked) and bool(t2i)
5594
  i2i_vis = (not checked) and bool(i2i)
5595
  i2v_vis = (not checked) and bool(i2v)
5596
  t2v_vis = (not checked) and bool(t2v)
5597
+ t2m_vis = (not checked) and bool(t2m)
5598
 
5599
  return (
5600
  # Chat UI group
 
5618
  gr.update(visible=i2v_vis), # image_to_video_prompt
5619
  gr.update(visible=not checked), # text_to_video_toggle
5620
  gr.update(visible=t2v_vis), # text_to_video_prompt
5621
+ gr.update(visible=not checked), # text_to_music_toggle
5622
+ gr.update(visible=t2m_vis), # text_to_music_prompt
5623
  gr.update(visible=not checked), # model_dropdown
5624
  gr.update(visible=not checked), # quick_start_md
5625
  gr.update(visible=not checked), # quick_examples_col
 
5627
 
5628
  beta_toggle.change(
5629
  toggle_beta,
5630
+ inputs=[beta_toggle, image_generation_toggle, image_to_image_toggle, image_to_video_toggle, text_to_video_toggle, text_to_music_toggle],
5631
  outputs=[
5632
  sidebar_chatbot,
5633
  sidebar_msg,
 
5648
  image_to_video_prompt,
5649
  text_to_video_toggle,
5650
  text_to_video_prompt,
5651
+ text_to_music_toggle,
5652
+ text_to_music_prompt,
5653
  model_dropdown,
5654
  quick_start_md,
5655
  quick_examples_col,
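One note on mechanics: create_music_replacement_blocks_text_to_music never edits the HTML directly; it emits SEARCH/REPLACE blocks that the pre-existing apply_search_replace_changes helper applies, targeting the first <section> when one exists and the opening <body> tag otherwise. Stripped of that machinery, the underlying splice amounts to finding the end of the opening <body> tag and dropping the player right after it. A self-contained sketch of that idea (illustrative only; the function below is not part of app.py):

def insert_audio_after_body(html: str, audio_html: str) -> str:
    """Place an <audio> snippet immediately after the opening <body ...> tag."""
    body_open = html.find("<body")
    if body_open == -1:
        return html + "\n" + audio_html          # no <body>: just append, as the helper does
    body_end = html.find(">", body_open) + 1     # index just past the opening tag
    return html[:body_end] + "\n    " + audio_html + html[body_end:]

page = "<!DOCTYPE html>\n<html>\n<body>\n  <h1>Demo</h1>\n</body>\n</html>"
player = '<audio controls src="file:///tmp/anycoder_audio/example.wav"></audio>'
print(insert_audio_after_body(page, player))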