SephStanek committed on
Commit
a2ff005
·
verified ·
1 Parent(s): 64567d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +437 -179
app.py CHANGED
@@ -2,11 +2,12 @@ import os
2
  import io
3
  import tempfile
4
  import datetime
 
 
 
5
  import torch
6
  import librosa
7
- import numpy as np
8
  import gradio as gr
9
- import asyncio
10
 
11
  from reportlab.pdfgen import canvas
12
  from reportlab.lib.pagesizes import letter
@@ -14,17 +15,24 @@ from reportlab.lib.utils import ImageReader
14
  from reportlab.lib import colors
15
  from reportlab.pdfbase import pdfmetrics
16
  from reportlab.pdfbase.ttfonts import TTFont
17
- from reportlab.platypus import Paragraph
18
- from reportlab.lib.styles import getSampleStyleSheet
19
 
20
  from transformers import (
21
  WhisperProcessor,
22
  AutoModelForSpeechSeq2Seq,
23
  AutoFeatureExtractor,
24
- AutoModel
25
  )
26
  from transformers import pipeline as hf_pipeline
27
 
 
 
 
 
 
 
 
 
 
28
 
29
  # ---------------------------------------------------------
30
  # FONTS
@@ -32,34 +40,33 @@ from transformers import pipeline as hf_pipeline
32
  pdfmetrics.registerFont(TTFont("PlayfairBold", "PlayfairDisplay-Bold.ttf"))
33
  pdfmetrics.registerFont(TTFont("Geneva", "Geneva.ttf"))
34
 
35
-
36
  # ---------------------------------------------------------
37
- # COLORS
38
  # ---------------------------------------------------------
39
- ACCENT = colors.HexColor("#8b5cf6")
40
- PRIMARY = colors.HexColor("#3b0c3f")
41
  LIGHT_GRAY = colors.HexColor("#e6e6e6")
 
42
  WHITE = colors.white
43
  BLACK = colors.black
44
 
45
  ENGINE_URL = "https://www.tourdefierce.vip/ai-music-detector"
 
46
 
47
- LOGO = "logo.jpg" # IMPORTANT: must be uploaded to HF Space root
 
48
 
49
 
50
  # ---------------------------------------------------------
51
- # ML MODELS
52
  # ---------------------------------------------------------
53
- ASR_MODEL = "openai/whisper-small"
54
- CLF_MODEL = "microsoft/wavlm-base-plus-sv"
55
-
56
  processor = WhisperProcessor.from_pretrained(ASR_MODEL)
57
  asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(ASR_MODEL)
58
  asr_pipe = hf_pipeline(
59
  "automatic-speech-recognition",
60
  model=asr_model,
61
  tokenizer=processor.tokenizer,
62
- feature_extractor=processor.feature_extractor
63
  )
64
 
65
  clf_processor = AutoFeatureExtractor.from_pretrained(CLF_MODEL)
@@ -67,25 +74,32 @@ clf_model = AutoModel.from_pretrained(CLF_MODEL)
67
 
68
 
69
  # ---------------------------------------------------------
70
- # DSP FUNCTIONS
71
  # ---------------------------------------------------------
72
  def compute_autotune_index(y, sr):
73
- """Detect pitch correction by measuring log-F0 variance."""
74
  f0, voiced, _ = librosa.pyin(
75
  y,
76
  sr=sr,
77
  fmin=librosa.note_to_hz("C2"),
78
- fmax=librosa.note_to_hz("C6")
79
  )
 
 
 
 
80
  f0 = f0[voiced > 0.5]
81
 
82
  if len(f0) < 10:
83
  return 0.0
84
 
85
- std = np.std(np.log(f0))
 
 
 
86
  max_std = 0.25
87
  score = 1 - np.clip(std / max_std, 0, 1)
88
- return float(score * 100)
89
 
90
 
91
  def extract_embeddings(y, sr):
@@ -95,76 +109,256 @@ def extract_embeddings(y, sr):
95
  return out.cpu().numpy()
96
 
97
 
98
def calculate_ai_score(emb):
    """Map the L2 norm of an embedding onto a pseudo-probability.

    The norm is rescaled linearly so that 50 maps to 0 and 150 maps to 1
    (clipped outside that range), then squeezed into [0.1, 1.0] and finally
    clipped to [0.05, 0.99] so the result never reads as a certainty.
    """
    lower_bound, upper_bound = 50, 150
    magnitude = np.linalg.norm(emb)
    fraction = np.clip(
        (magnitude - lower_bound) / (upper_bound - lower_bound), 0, 1
    )
    squeezed = fraction * 0.9 + 0.1
    return float(np.clip(squeezed, 0.05, 0.99))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  def detect_key(y, sr):
106
  chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
107
- mean = chroma.mean(axis=1)
108
- idx = np.argmax(mean)
109
- KEYS = ["C","C#","D","Eb","E","F","F#","G","Ab","A","Bb","B"]
 
 
110
 
111
- root = KEYS[idx]
112
- maj_energy = mean[(idx + 4) % 12] + mean[(idx + 7) % 12]
113
- min_energy = mean[(idx + 3) % 12] + mean[(idx + 7) % 12]
114
 
115
- return f"{root} major" if maj_energy > min_energy else f"{root} minor"
116
 
117
 
118
  def detect_bpm(y, sr):
119
- tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
120
- return float(tempo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
 
123
  # ---------------------------------------------------------
124
- # FORENSIC ANALYSIS GENERATOR
125
  # ---------------------------------------------------------
126
def generate_forensic_analysis(ai, human, atune, key_sig, bpm, transcript):
    """Build the plain-text forensic summary, one finding per line.

    Args:
        ai: AI likelihood as a percentage.
        human: Human likelihood as a percentage.
        atune: Autotune index on a 0-100 scale.
        key_sig: Detected key signature label.
        bpm: Detected tempo in beats per minute.
        transcript: Transcribed text; ``None`` is treated as empty.

    Returns:
        The findings joined with newlines. The last line previews at most
        the first 100 characters of the transcript.
    """
    out = [
        f"AI likelihood measured at {ai:.1f}%.",
        f"Human likelihood measured at {human:.1f}%.",
        f"Autotune index computed at {atune:.1f}/100.",
        f"Detected key signature: {key_sig}.",
        f"Detected tempo: {bpm:.1f} BPM.",
    ]

    if atune > 65:
        out.append("Pitch trajectories show unusually low variance, suggesting strong pitch correction.")
    elif atune > 35:
        out.append("Moderate pitch regularization detected — could be gentle EQ or light tuning.")
    else:
        out.append("Pitch behavior appears natural and uncorrected.")

    if ai > 70:
        out.append("Spectral fingerprint aligns strongly with synthetic speech models.")
    elif ai > 40:
        out.append("Some harmonic structures resemble AI synthesis but confidence is mixed.")
    else:
        out.append("Signal characteristics strongly support human vocal production.")

    # Only mark the preview with an ellipsis when text was actually cut off.
    transcript = transcript or ""
    preview = transcript[:100]
    if len(transcript) > 100:
        preview += "..."
    out.append(f"Transcription summary: {preview}")

    return "\n".join(out)
151
 
152
 
153
  # ---------------------------------------------------------
154
- # PDF BUILDER
155
  # ---------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def make_pdf(
157
  ai_score,
158
  human_score,
159
  atune,
160
  shade,
161
- analysis,
162
- transcript,
163
  key_sig,
164
  bpm,
165
- clip_title
 
 
 
 
166
  ):
167
-
168
  buffer = io.BytesIO()
169
  c = canvas.Canvas(buffer, pagesize=letter)
170
  W, H = letter
@@ -175,11 +369,11 @@ def make_pdf(
175
 
176
  # Logo
177
  try:
178
- c.drawImage(ImageReader(open(LOGO, "rb")), 40, H - 120, width=90, height=90)
179
- except Exception as e:
180
- print("Logo error:", e)
181
 
182
- # Header
183
  c.setFillColor(PRIMARY)
184
  c.setFont("PlayfairBold", 32)
185
  c.drawString(150, H - 60, "Tour de Fierce")
@@ -188,43 +382,30 @@ def make_pdf(
188
  c.setFont("Geneva", 14)
189
  c.drawString(150, H - 82, "Audio Clapback Reportβ„’")
190
 
191
- # Timestamp
192
  c.setFillColor(BLACK)
193
  c.setFont("Geneva", 10)
194
  c.drawString(150, H - 98, f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}")
195
-
196
- # Clip name
197
  c.setFont("Geneva", 12)
198
- c.drawString(40, H - 150, f"Clip analyzed: {clip_title}")
199
 
200
- # QR Code
201
  try:
202
  import qrcode
 
203
  qr = qrcode.make(ENGINE_URL)
204
  buf = io.BytesIO()
205
  qr.save(buf, format="PNG")
206
  buf.seek(0)
207
  c.drawImage(ImageReader(buf), W - 120, H - 140, width=80, height=80)
208
- except:
209
  pass
210
 
 
211
  c.setStrokeColor(LIGHT_GRAY)
212
  c.line(40, H - 165, W - 40, H - 165)
213
 
214
- # ---------------------------------------------------------
215
- # SCORE BOXES
216
- # ---------------------------------------------------------
217
- def scale_color(val, invert=False):
218
- if invert:
219
- if val <= 25: return colors.green
220
- if val <= 75: return colors.yellow
221
- return colors.red
222
- else:
223
- if val >= 75: return colors.green
224
- if val >= 25: return colors.yellow
225
- return colors.red
226
-
227
- # AI Box
228
  c.setFillColor(scale_color(ai_score, invert=True))
229
  c.rect(40, H - 260, 150, 80, fill=1)
230
  c.setFillColor(WHITE)
@@ -233,113 +414,139 @@ def make_pdf(
233
  c.setFont("PlayfairBold", 26)
234
  c.drawString(55, H - 220, f"{ai_score:.1f}%")
235
 
236
- # Human Box
237
  c.setFillColor(scale_color(human_score))
238
  c.rect(210, H - 260, 150, 80, fill=1)
239
  c.setFillColor(WHITE)
 
240
  c.drawString(225, H - 195, "Human Likelihood")
241
  c.setFont("PlayfairBold", 26)
242
  c.drawString(225, H - 220, f"{human_score:.1f}%")
243
 
244
- # Autotune Box
245
  c.setFillColor(scale_color(atune, invert=True))
246
  c.rect(380, H - 260, 150, 80, fill=1)
247
  c.setFillColor(WHITE)
 
248
  c.drawString(395, H - 195, "Autotune Index")
249
  c.setFont("PlayfairBold", 26)
250
  c.drawString(395, H - 220, f"{atune:.1f}/100")
251
 
252
- # ---------------------------------------------------------
253
- # SHADE METER
254
- # ---------------------------------------------------------
255
  c.setFillColor(BLACK)
256
  c.setFont("Geneva", 12)
257
- c.drawString(40, H - 280, "Shade Meter")
258
 
259
- BAR_Y = H - 300
 
 
 
260
 
261
  c.setFillColor(LIGHT_GRAY)
262
- c.roundRect(40, BAR_Y, 490, 16, 8, fill=1)
263
 
264
- fill_w = (shade / 100) * 490
265
  c.setFillColor(ACCENT)
266
- c.roundRect(40, BAR_Y, fill_w, 16, 8, fill=1)
 
267
 
268
  c.setFillColor(BLACK)
269
  c.setFont("Geneva", 10)
270
- c.drawString(540, BAR_Y + 3, f"{shade:.1f}/100")
271
-
272
- # Shade interpretation
273
- if shade < 10:
274
- sm = "Saint status: no shade detected!"
275
- elif shade < 40:
276
- sm = "A sprinkle of shade β€” but nothing wild."
277
- elif shade < 75:
278
- sm = "Shady lady β€” there are things to discuss."
279
- else:
280
- sm = "Maximum shade parade. AI all up in here."
281
-
282
- c.drawString(40, BAR_Y - 20, sm)
 
 
283
 
284
- # ---------------------------------------------------------
285
- # MUSICALITY
286
- # ---------------------------------------------------------
287
  c.setFont("PlayfairBold", 18)
288
  c.setFillColor(PRIMARY)
289
- c.drawString(40, H - 360, "Musicality Analysis")
 
290
 
291
  c.setFont("Geneva", 11)
292
  c.setFillColor(BLACK)
293
- c.drawString(40, H - 385, f"Key Signature: {key_sig}")
294
- c.drawString(40, H - 402, f"Tempo (BPM): {bpm:.1f}")
 
 
295
 
296
- # ---------------------------------------------------------
297
- # FORENSIC BREAKDOWN
298
- # ---------------------------------------------------------
299
  c.setFont("PlayfairBold", 18)
300
  c.setFillColor(PRIMARY)
301
- c.drawString(40, H - 440, "Forensic Breakdown")
 
302
 
303
  c.setFont("Geneva", 10)
304
- ytxt = H - 465
305
-
306
- for line in analysis.split("\n"):
307
- if ytxt < 80:
308
  c.showPage()
309
- ytxt = H - 60
310
  c.setFont("Geneva", 10)
 
311
  c.drawString(40, ytxt, line)
312
- ytxt -= 14
313
 
314
- # ---------------------------------------------------------
315
- # TRANSCRIPT
316
- # ---------------------------------------------------------
317
  c.setFont("PlayfairBold", 18)
318
  c.setFillColor(PRIMARY)
319
- c.drawString(40, ytxt - 30, "Transcript")
 
 
 
 
 
320
 
321
- ytxt -= 55
322
-
323
- styles = getSampleStyleSheet()
324
- style = styles["Normal"]
325
- style.fontName = "Geneva"
326
- style.fontSize = 10
327
- style.leading = 12
 
 
 
328
 
329
- para = Paragraph(transcript.replace("\n", "<br/>"), style)
330
- pw = W - 80
 
 
 
 
 
 
 
 
331
 
332
- w, h = para.wrap(pw, 400)
333
- para.drawOn(c, 40, ytxt - h)
334
- ytxt -= (h + 20)
 
 
 
 
 
 
 
335
 
336
- # Footer
337
  c.setStrokeColor(LIGHT_GRAY)
338
- c.line(40, 50, W - 40, 50)
339
-
340
  c.setFont("Geneva", 9)
341
- c.drawString(40, 35, "Β© 2025 Tour de Fierce β€” All Shade, No Shame.")
342
- c.drawString(300, 35, "www.tourdefierce.vip")
343
 
344
  c.save()
345
  buffer.seek(0)
@@ -352,80 +559,117 @@ def make_pdf(
352
 
353
 
354
  # ---------------------------------------------------------
355
- # MAIN ANALYSIS
356
  # ---------------------------------------------------------
357
- async def run_analysis(audio_file):
358
  if not audio_file:
359
- return ("No audio file uploaded.", None, None, None, None, None, None, None, None)
360
-
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  y, sr = librosa.load(audio_file, sr=16000, mono=True)
362
 
363
  # transcription
364
  try:
365
  text = asr_pipe({"array": y, "sampling_rate": sr})["text"]
366
- except:
367
  text = "[Transcription unavailable]"
368
 
 
 
 
 
369
  emb = extract_embeddings(y, sr)
370
- ai = calculate_ai_score(emb)
371
- human = 1 - ai
372
- atune = compute_autotune_index(y, sr)
373
 
374
- shade = float((ai * 100 + atune + (human * 100)) / 3)
 
375
 
 
376
  key_sig = detect_key(y, sr)
377
  bpm = detect_bpm(y, sr)
 
378
 
379
- suggested = "Given the tessitura, this song is best suited for a high voice (tenor or soprano)."
380
-
381
- analysis = generate_forensic_analysis(ai * 100, human * 100, atune, key_sig, bpm, text)
 
 
 
382
 
383
  clip_title = os.path.basename(audio_file)
384
 
385
  pdf_path = make_pdf(
386
- ai * 100,
387
- human * 100,
388
- atune,
389
  shade,
390
- analysis,
391
- text,
392
  key_sig,
393
  bpm,
394
- clip_title
 
 
 
 
395
  )
396
 
397
  return (
398
  text,
399
- f"{ai*100:.1f}%",
400
- f"{human*100:.1f}%",
401
- f"{atune:.1f}",
402
  f"{shade:.1f}",
403
  key_sig,
404
  f"{bpm:.1f}",
405
- suggested,
 
 
406
  pdf_path,
407
  )
408
 
409
 
410
- # ---------------------------------------------------------
411
  # UI
412
- # ---------------------------------------------------------
413
  with gr.Blocks() as demo:
414
-
415
- gr.HTML("""
416
- <div style='text-align:center; padding:20px;'>
417
- <img src="/file/logo.jpg" style="width:140px; margin-bottom:10px;"/>
418
- <h1 style='font-size:36px; font-weight:800;'>πŸ‘‹ Tour de Fierce Audio Clapback Engineβ„’</h1>
419
- <p style='color:#666;'>AI Detector β€’ Autotune Detector β€’ Key & BPM β€’ Forensic Reporting</p>
420
- </div>
421
- """)
 
 
 
 
422
 
423
  with gr.Row():
424
  audio_in = gr.Audio(type="filepath", label="Upload audio")
425
  run_btn = gr.Button("Run Clapback πŸ‘", variant="primary")
426
 
427
  with gr.Row():
428
- transcript = gr.Textbox(label="Transcript", interactive=False)
 
 
 
 
 
 
 
429
  ai_out = gr.Textbox(label="AI Likelihood", interactive=False)
430
  human_out = gr.Textbox(label="Human Likelihood", interactive=False)
431
  atune_out = gr.Textbox(label="Autotune Index", interactive=False)
@@ -436,10 +680,22 @@ with gr.Blocks() as demo:
436
  bpm_out = gr.Textbox(label="Tempo (BPM)", interactive=False)
437
  voice_out = gr.Textbox(label="Suggested Voice Type", interactive=False)
438
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  pdf_download = gr.File(label="Download Report")
440
 
441
  run_btn.click(
442
- fn=lambda f: asyncio.run(run_analysis(f)),
443
  inputs=audio_in,
444
  outputs=[
445
  transcript,
@@ -450,8 +706,10 @@ with gr.Blocks() as demo:
450
  key_out,
451
  bpm_out,
452
  voice_out,
453
- pdf_download
454
- ]
 
 
455
  )
456
 
457
  demo.launch()
 
2
  import io
3
  import tempfile
4
  import datetime
5
+ import textwrap
6
+
7
+ import numpy as np
8
  import torch
9
  import librosa
 
10
  import gradio as gr
 
11
 
12
  from reportlab.pdfgen import canvas
13
  from reportlab.lib.pagesizes import letter
 
15
  from reportlab.lib import colors
16
  from reportlab.pdfbase import pdfmetrics
17
  from reportlab.pdfbase.ttfonts import TTFont
 
 
18
 
19
  from transformers import (
20
  WhisperProcessor,
21
  AutoModelForSpeechSeq2Seq,
22
  AutoFeatureExtractor,
23
+ AutoModel,
24
  )
25
  from transformers import pipeline as hf_pipeline
26
 
27
+ # --- SciPy / librosa compatibility patch (hann -> windows.hann) ----------
28
+ try:
29
+ import scipy.signal as _sg
30
+ from scipy.signal import windows as _win
31
+
32
+ if not hasattr(_sg, "hann"):
33
+ _sg.hann = _win.hann
34
+ except Exception:
35
+ _sg = None
36
 
37
  # ---------------------------------------------------------
38
  # FONTS
 
40
  pdfmetrics.registerFont(TTFont("PlayfairBold", "PlayfairDisplay-Bold.ttf"))
41
  pdfmetrics.registerFont(TTFont("Geneva", "Geneva.ttf"))
42
 
 
43
  # ---------------------------------------------------------
44
+ # COLORS & CONFIG
45
  # ---------------------------------------------------------
46
+ ACCENT = colors.HexColor("#8b5cf6") # violet accent
47
+ PRIMARY = colors.HexColor("#3b0c3f") # eggplant
48
  LIGHT_GRAY = colors.HexColor("#e6e6e6")
49
+ GOLD = colors.HexColor("#f4c542") # deeper gold for better contrast
50
  WHITE = colors.white
51
  BLACK = colors.black
52
 
53
  ENGINE_URL = "https://www.tourdefierce.vip/ai-music-detector"
54
+ LOGO_FILE = "logo.jpg"
55
 
56
+ ASR_MODEL = "openai/whisper-small" # best free-tier Whisper
57
+ CLF_MODEL = "microsoft/wavlm-base-plus-sv"
58
 
59
 
60
  # ---------------------------------------------------------
61
+ # LOAD MODELS
62
  # ---------------------------------------------------------
 
 
 
63
  processor = WhisperProcessor.from_pretrained(ASR_MODEL)
64
  asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(ASR_MODEL)
65
  asr_pipe = hf_pipeline(
66
  "automatic-speech-recognition",
67
  model=asr_model,
68
  tokenizer=processor.tokenizer,
69
+ feature_extractor=processor.feature_extractor,
70
  )
71
 
72
  clf_processor = AutoFeatureExtractor.from_pretrained(CLF_MODEL)
 
74
 
75
 
76
  # ---------------------------------------------------------
77
+ # DSP / ANALYSIS UTILITIES
78
  # ---------------------------------------------------------
79
def compute_autotune_index(y, sr):
    """Score from 0 to 100 of how 'quantized' the tracked pitch looks.

    Runs librosa's pYIN tracker between C2 and C6, keeps the frames whose
    second pYIN output exceeds 0.5, and maps the standard deviation of
    log-F0 onto a score: a spread of 0 scores 100, a spread of 0.25 or more
    scores 0. Returns 0.0 when no pitch track comes back or when fewer than
    10 frames survive the filter.
    """
    lowest_hz = librosa.note_to_hz("C2")
    highest_hz = librosa.note_to_hz("C6")
    f0, voiced, _ = librosa.pyin(y, sr=sr, fmin=lowest_hz, fmax=highest_hz)
    if f0 is None:
        return 0.0

    voiced_f0 = f0[voiced > 0.5]
    if len(voiced_f0) < 10:
        return 0.0

    # A very smooth / quantized pitch line gives a small spread in log-F0.
    spread_ceiling = 0.25
    spread = np.std(np.log(voiced_f0))
    steadiness = 1 - np.clip(spread / spread_ceiling, 0, 1)
    return float(steadiness * 100.0)
103
 
104
 
105
  def extract_embeddings(y, sr):
 
109
  return out.cpu().numpy()
110
 
111
 
112
def calculate_ai_probability(emb, y, sr, autotune_idx):
    """Blend three heuristics into an AI probability between 0.05 and 0.99.

    The terms, each scaled to [0, 1], are weighted 0.4 / 0.3 / 0.3:
      * embedding norm, rescaled from the 40..140 band;
      * dynamic flatness, from the 5th-95th percentile spread of frame RMS
        (a smaller spread gives a larger term);
      * ``autotune_idx`` divided by 100.

    ``sr`` is accepted but not used by this function.
    """
    band_low, band_high = 40, 140
    magnitude = np.linalg.norm(emb)
    norm_term = np.clip((magnitude - band_low) / (band_high - band_low), 0, 1)

    # Frame-level loudness taken from the STFT magnitude.
    spectrum = np.abs(librosa.stft(y))
    frame_rms = librosa.feature.rms(S=spectrum)[0]
    loud = np.percentile(frame_rms, 95)
    quiet = np.percentile(frame_rms, 5)
    flat_term = 1.0 - np.clip(((loud - quiet) - 0.02) / 0.1, 0, 1)

    tune_term = autotune_idx / 100.0

    blended = 0.4 * norm_term + 0.3 * flat_term + 0.3 * tune_term

    # Keep the result away from an absolute 0% or 100%.
    return float(np.clip(blended * 0.95 + 0.05, 0.05, 0.99))
141
 
142
 
143
def detect_key(y, sr):
    """Guess a key label such as "C major" from averaged chroma energy.

    The root is the pitch class with the highest mean chroma. The mode is
    chosen by comparing the energy on the major third plus fifth against the
    minor third plus fifth; a tie is reported as major.
    """
    pitch_classes = ("C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B")

    profile = librosa.feature.chroma_cqt(y=y, sr=sr).mean(axis=1)
    tonic = int(np.argmax(profile))

    fifth = profile[(tonic + 7) % 12]
    major_strength = profile[(tonic + 4) % 12] + fifth
    minor_strength = profile[(tonic + 3) % 12] + fifth

    mode = "major" if major_strength >= minor_strength else "minor"
    return f"{pitch_classes[tonic]} {mode}"
155
 
156
 
157
def detect_bpm(y, sr):
    """Estimate the tempo of ``y`` in beats per minute.

    The estimate comes from librosa's tempo estimator applied to the onset
    strength envelope. Returns 0.0 when the estimator yields nothing.
    """
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)

    # librosa 0.10 moved the estimator to ``librosa.feature.tempo`` and
    # deprecated ``librosa.beat.tempo``; prefer the new location and fall
    # back to the old one on earlier releases.
    tempo_fn = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo

    tempo = tempo_fn(onset_envelope=onset_env, sr=sr)
    if tempo is None or len(tempo) == 0:
        return 0.0
    return float(tempo[0])
163
+
164
+
165
def estimate_voice_type(y, sr):
    """Suggest a coarse voice range from the median tracked pitch.

    Pitch is tracked with librosa's pYIN between C2 and C6. The median of
    the frames that pass the voicing filter is bucketed as:
      * below G3 -> lower voice,
      * below C4 -> mid voice,
      * otherwise -> high voice.

    Returns a sentence for display, or a fallback message when no pitch
    track comes back or the voicing values sum to less than 5.
    """
    f0, voiced, _ = librosa.pyin(
        y,
        sr=sr,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C6"),
    )

    if f0 is None or np.sum(voiced) < 5:
        return "Unable to estimate voice type from this clip."

    median_hz = np.median(f0[voiced > 0.5])

    # Very coarse buckets
    if median_hz < librosa.note_to_hz("G3"):
        base = "lower voice (baritone / alto range)"
    elif median_hz < librosa.note_to_hz("C4"):
        base = "mid voice (baritenor / mezzo range)"
    else:
        base = "high voice (tenor or soprano range)"

    return f"Given the tessitura, this song is best suited for a {base}."
190
+
191
+
192
def compute_production_polish(y, sr):
    """Return a 0-100 heuristic for how polished / produced the audio is.

    Two terms are blended 60/40: a compression term that grows as the
    5th-95th percentile spread of frame RMS shrinks, and a uniformity term
    that grows with mean spectral flatness. ``sr`` is accepted but not used.
    """
    magnitude = np.abs(librosa.stft(y))
    loudness = librosa.feature.rms(S=magnitude)[0]

    loud = np.percentile(loudness, 95)
    quiet = np.percentile(loudness, 5)
    compression = 1.0 - np.clip(((loud - quiet) - 0.015) / 0.12, 0, 1)

    mean_flatness = np.mean(librosa.feature.spectral_flatness(S=magnitude))
    uniformity = np.clip((mean_flatness - 0.1) / 0.4, 0, 1)

    blended = 0.6 * compression + 0.4 * uniformity
    return float(blended * 100.0)
205
+
206
+
207
def compute_shade_score(ai_percent, autotune_idx, polish_idx, weights=(0.6, 0.25, 0.15)):
    """Combine the three 0-100 metrics into the 0-100 Shade Meter score.

    Args:
        ai_percent: AI likelihood as a percentage.
        autotune_idx: Autotune index (0-100).
        polish_idx: Production polish index (0-100).
        weights: Weights for (AI likelihood, autotune, polish). The default
            is 60% / 25% / 15%.

    Returns:
        The weighted sum as a float clipped to [0, 100]. A NaN component
        counts as 0, and each component is limited to [0, 100] before
        weighting, so one bad metric cannot turn the whole score into NaN.
    """
    components = np.nan_to_num(
        np.array([ai_percent, autotune_idx, polish_idx], dtype=float),
        nan=0.0,
        posinf=100.0,
        neginf=0.0,
    )
    components = np.clip(components, 0.0, 100.0)
    shade = float(np.dot(np.asarray(weights, dtype=float), components))
    return float(np.clip(shade, 0, 100))
216
 
217
 
218
  # ---------------------------------------------------------
219
+ # TEXT HELPERS
220
  # ---------------------------------------------------------
221
def wrap_paragraph(text, width=90):
    """Wrap ``text`` into a list of lines at most ``width`` characters wide.

    Each input line is wrapped on its own with ``textwrap.wrap``. Blank or
    whitespace-only lines are kept as empty strings so paragraph breaks
    survive. ``None`` or an empty string gives an empty list.
    """
    if not text:
        return []

    wrapped = []
    for para in str(text).splitlines():
        if para.strip():
            wrapped.extend(textwrap.wrap(para, width=width))
        else:
            wrapped.append("")
    return wrapped
229
+
230
+
231
def build_scientific_analysis(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, polish_idx):
    """Compose the technical report text as titled sections.

    Each section is a heading line followed by one paragraph line, and
    sections are separated by a blank line. ``shade`` is accepted but does
    not appear in the generated text.
    """
    sections = (
        (
            "Overview",
            "This clip was analyzed using a hybrid signal-processing and deep-learning stack. "
            "Based on embedding statistics, dynamic range, spectral behavior, and pitch stability, "
            f"the system estimates a {ai_pct:.1f}% probability that the source material is AI-generated, "
            f"and a {human_pct:.1f}% probability that it is primarily human-performed.",
        ),
        (
            "Pitch & Autotune",
            f"Fundamental frequency tracking suggests an autotune index of {autotune_idx:.1f}/100. "
            "Lower scores indicate more organic pitch variance, while higher scores indicate quantized or "
            "grid-snapped intonation.",
        ),
        (
            "Rhythm & Tempo",
            f"Tempo estimation places this performance at approximately {bpm:.1f} beats per minute. "
            "The detected tempo is derived from onset strength peaks and may vary slightly with different sections "
            "of the recording.",
        ),
        (
            "Timbre & Production",
            f"Timbre and dynamics analysis yields a production polish score of {polish_idx:.1f}/100. "
            "Higher scores correspond to compressed, consistently loud, and spectrally uniform material, "
            "often associated with heavily produced or synthetic audio.",
        ),
        (
            "Musical Context",
            f"Harmonic analysis indicates that the material centers around {key_sig}. "
            "This key estimate is based on chroma energy distribution over the length of the clip.",
        ),
    )
    return "\n\n".join(f"{title}\n{body}" for title, body in sections)
268
+
269
+
270
def build_clapback(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, voice_text):
    """Compose the humorous "clapback" summary of the analysis results.

    Args:
        ai_pct: AI likelihood as a percentage; picks one of three verdicts
            (>= 75, >= 40, otherwise).
        human_pct: Human likelihood as a percentage. Accepted but not used
            in the generated text.
        autotune_idx: Autotune index (0-100); picks one of three remarks
            (>= 70, >= 35, otherwise).
        shade: Shade Meter score (0-100).
        key_sig: Detected key signature label.
        bpm: Detected tempo in beats per minute.
        voice_text: Voice-range sentence appended at the end.

    Returns:
        The summary as newline-joined text, with a blank line between
        paragraphs.
    """
    if ai_pct >= 75:
        ai_verdict = (
            f"This track is giving **full robot fantasy** with an AI likelihood of {ai_pct:.1f}%. "
            "If there was a human involved, they were probably just pressing 'render.'"
        )
    elif ai_pct >= 40:
        ai_verdict = (
            f"This performance lives in the uncanny valley with an AI likelihood of {ai_pct:.1f}%. "
            "Some human in there, but the machines are definitely helping."
        )
    else:
        ai_verdict = (
            f"With only {ai_pct:.1f}% AI likelihood, this one is serving mostly human realness. "
            "Congrats: your soul is still in the mix."
        )

    if autotune_idx >= 70:
        tune_verdict = (
            f"Autotune index {autotune_idx:.1f}/100: every note is so locked to the grid it should pay rent there."
        )
    elif autotune_idx >= 35:
        tune_verdict = (
            f"Autotune index {autotune_idx:.1f}/100: tasteful correction, but we definitely hear the safety net."
        )
    else:
        # The em dash below is real punctuation, not a mis-decoded byte run.
        tune_verdict = (
            f"Autotune index {autotune_idx:.1f}/100: pitch is flying mostly solo — brave, messy, and very human."
        )

    shade_verdict = (
        f"Shade Meter score: {shade:.1f}/100. "
        "Zero would mean unplugged, unprocessed, angel-on-a-stool vibes. "
        f"You're sitting at {shade:.1f}, which means there's at least a mild breeze of manufactured perfection "
        "blowing through this mix."
    )

    music_verdict = (
        f"Musically, the track hangs out around {key_sig} at about {bpm:.1f} BPM, so if you’re clapping back on TikTok, "
        "now you know what tempo to drag them in."
    )

    tone_lines = [
        "CLAPBACK SUMMARY",
        "",
        ai_verdict,
        "",
        tune_verdict,
        "",
        shade_verdict,
        "",
        music_verdict,
        "",
        f"Voice-tessitura take: {voice_text}",
    ]
    return "\n".join(tone_lines)
322
 
323
 
324
  # ---------------------------------------------------------
325
+ # PDF GENERATION
326
  # ---------------------------------------------------------
327
def scale_color(val, invert=False):
    """Pick the fill color for a score box.

    With ``invert=False`` a high value is good: >= 75 is green, >= 25 is
    gold, anything else is red. With ``invert=True`` a low value is good:
    <= 25 is green, <= 75 is gold, anything else is red.
    """
    if invert:
        is_good = val <= 25
        is_medium = val <= 75
    else:
        is_good = val >= 75
        is_medium = val >= 25

    if is_good:
        return colors.green
    return GOLD if is_medium else colors.red
347
+
348
+
349
  def make_pdf(
350
  ai_score,
351
  human_score,
352
  atune,
353
  shade,
 
 
354
  key_sig,
355
  bpm,
356
+ transcript,
357
+ scientific_text,
358
+ clapback_text,
359
+ clip_title,
360
+ polish_idx,
361
  ):
 
362
  buffer = io.BytesIO()
363
  c = canvas.Canvas(buffer, pagesize=letter)
364
  W, H = letter
 
369
 
370
  # Logo
371
  try:
372
+ c.drawImage(LOGO_FILE, 40, H - 120, width=90, height=90)
373
+ except Exception:
374
+ pass
375
 
376
+ # Branding
377
  c.setFillColor(PRIMARY)
378
  c.setFont("PlayfairBold", 32)
379
  c.drawString(150, H - 60, "Tour de Fierce")
 
382
  c.setFont("Geneva", 14)
383
  c.drawString(150, H - 82, "Audio Clapback Reportβ„’")
384
 
385
+ # Timestamp & clip
386
  c.setFillColor(BLACK)
387
  c.setFont("Geneva", 10)
388
  c.drawString(150, H - 98, f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}")
 
 
389
  c.setFont("Geneva", 12)
390
+ c.drawString(40, H - 145, f"Clip analyzed: {clip_title}")
391
 
392
+ # QR to engine
393
  try:
394
  import qrcode
395
+
396
  qr = qrcode.make(ENGINE_URL)
397
  buf = io.BytesIO()
398
  qr.save(buf, format="PNG")
399
  buf.seek(0)
400
  c.drawImage(ImageReader(buf), W - 120, H - 140, width=80, height=80)
401
+ except Exception:
402
  pass
403
 
404
+ # Divider line
405
  c.setStrokeColor(LIGHT_GRAY)
406
  c.line(40, H - 165, W - 40, H - 165)
407
 
408
+ # ---------------------- SCORE BOXES ----------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  c.setFillColor(scale_color(ai_score, invert=True))
410
  c.rect(40, H - 260, 150, 80, fill=1)
411
  c.setFillColor(WHITE)
 
414
  c.setFont("PlayfairBold", 26)
415
  c.drawString(55, H - 220, f"{ai_score:.1f}%")
416
 
 
417
  c.setFillColor(scale_color(human_score))
418
  c.rect(210, H - 260, 150, 80, fill=1)
419
  c.setFillColor(WHITE)
420
+ c.setFont("Geneva", 11)
421
  c.drawString(225, H - 195, "Human Likelihood")
422
  c.setFont("PlayfairBold", 26)
423
  c.drawString(225, H - 220, f"{human_score:.1f}%")
424
 
 
425
  c.setFillColor(scale_color(atune, invert=True))
426
  c.rect(380, H - 260, 150, 80, fill=1)
427
  c.setFillColor(WHITE)
428
+ c.setFont("Geneva", 11)
429
  c.drawString(395, H - 195, "Autotune Index")
430
  c.setFont("PlayfairBold", 26)
431
  c.drawString(395, H - 220, f"{atune:.1f}/100")
432
 
433
+ # ---------------------- SHADE METER ----------------------
 
 
434
  c.setFillColor(BLACK)
435
  c.setFont("Geneva", 12)
436
+ c.drawString(40, H - 295, "Shade Meter")
437
 
438
+ # capsule bar background (below the title so it doesn't overlap)
439
+ bar_y = H - 310
440
+ bar_height = 14
441
+ bar_width = 490
442
 
443
  c.setFillColor(LIGHT_GRAY)
444
+ c.roundRect(40, bar_y, bar_width, bar_height, 7, fill=1)
445
 
446
+ # fill proportional to shade score
447
  c.setFillColor(ACCENT)
448
+ fill_w = (shade / 100.0) * bar_width
449
+ c.roundRect(40, bar_y, fill_w, bar_height, 7, fill=1)
450
 
451
  c.setFillColor(BLACK)
452
  c.setFont("Geneva", 10)
453
+ c.drawString(540, bar_y + 1, f"{shade:.1f}/100")
454
+
455
+ # explanatory blurb
456
+ shade_blurb = (
457
+ "The Shade Meter provides a comprehensive analysis of the uploaded file, representing exactly "
458
+ "how much shade you are entitled to direct toward the source of the clip. The ideal score is 0%, "
459
+ "indicating real, acoustic instruments and un-pitch-corrected vocals. Moderate scores may reflect "
460
+ "MIDI instruments or noticeably processed vocals. A 100 is the ultimate shade parade, with 100% "
461
+ "confidence that the clip was generated by an AI system."
462
+ )
463
+ c.setFont("Geneva", 9)
464
+ ytxt = H - 330
465
+ for line in wrap_paragraph(shade_blurb, width=95):
466
+ c.drawString(40, ytxt, line)
467
+ ytxt -= 11
468
 
469
+ # ---------------------- MUSICALITY -----------------------
470
+ ytxt -= 5
 
471
  c.setFont("PlayfairBold", 18)
472
  c.setFillColor(PRIMARY)
473
+ c.drawString(40, ytxt, "Musicality Analysis")
474
+ ytxt -= 18
475
 
476
  c.setFont("Geneva", 11)
477
  c.setFillColor(BLACK)
478
+ c.drawString(40, ytxt, f"Key Signature: {key_sig}")
479
+ ytxt -= 14
480
+ c.drawString(40, ytxt, f"Tempo (BPM): {bpm:.1f}")
481
+ ytxt -= 20
482
 
483
+ # ----------------- TECHNICAL FORENSIC ANALYSIS -----------------
 
 
484
  c.setFont("PlayfairBold", 18)
485
  c.setFillColor(PRIMARY)
486
+ c.drawString(40, ytxt, "Technical Forensic Analysis")
487
+ ytxt -= 18
488
 
489
  c.setFont("Geneva", 10)
490
+ c.setFillColor(BLACK)
491
+ for line in wrap_paragraph(scientific_text, width=95):
492
+ if ytxt < 60:
 
493
  c.showPage()
494
+ W2, H2 = letter
495
  c.setFont("Geneva", 10)
496
+ ytxt = H2 - 60
497
  c.drawString(40, ytxt, line)
498
+ ytxt -= 11
499
 
500
+ # ----------------- CLAPBACK SECTION -----------------
501
+ ytxt -= 10
 
502
  c.setFont("PlayfairBold", 18)
503
  c.setFillColor(PRIMARY)
504
+ if ytxt < 60:
505
+ c.showPage()
506
+ W2, H2 = letter
507
+ ytxt = H2 - 60
508
+ c.drawString(40, ytxt, "Clapback Shade Report")
509
+ ytxt -= 18
510
 
511
+ c.setFont("Geneva", 10)
512
+ c.setFillColor(BLACK)
513
+ for line in wrap_paragraph(clapback_text, width=95):
514
+ if ytxt < 60:
515
+ c.showPage()
516
+ W2, H2 = letter
517
+ c.setFont("Geneva", 10)
518
+ ytxt = H2 - 60
519
+ c.drawString(40, ytxt, line)
520
+ ytxt -= 11
521
 
522
+ # ----------------- TRANSCRIPT -----------------
523
+ ytxt -= 10
524
+ c.setFont("PlayfairBold", 18)
525
+ c.setFillColor(PRIMARY)
526
+ if ytxt < 60:
527
+ c.showPage()
528
+ W2, H2 = letter
529
+ ytxt = H2 - 60
530
+ c.drawString(40, ytxt, "Transcript")
531
+ ytxt -= 18
532
 
533
+ c.setFont("Geneva", 9)
534
+ c.setFillColor(BLACK)
535
+ for line in wrap_paragraph(transcript, width=100):
536
+ if ytxt < 50:
537
+ c.showPage()
538
+ W2, H2 = letter
539
+ c.setFont("Geneva", 9)
540
+ ytxt = H2 - 60
541
+ c.drawString(40, ytxt, line)
542
+ ytxt -= 10
543
 
544
+ # footer on last page
545
  c.setStrokeColor(LIGHT_GRAY)
546
+ c.line(40, 40, W - 40, 40)
 
547
  c.setFont("Geneva", 9)
548
+ c.drawString(40, 28, "© 2025 Tour de Fierce — All Shade, No Shame.")
549
+ c.drawString(300, 28, "www.tourdefierce.vip")
550
 
551
  c.save()
552
  buffer.seek(0)
 
559
 
560
 
561
  # ---------------------------------------------------------
562
+ # MAIN ANALYSIS PIPELINE
563
  # ---------------------------------------------------------
564
def run_analysis(audio_file):
    """Run the full analysis pipeline on one uploaded audio clip.

    Parameters
    ----------
    audio_file
        Path of the clip to analyse. A falsy value (``None`` or ``""``)
        means nothing was uploaded.

    Returns
    -------
    tuple
        Always 11 items, in this order: transcript text, AI percentage,
        human percentage, autotune index, shade score, key signature,
        tempo (BPM), voice-type text, scientific analysis text, clapback
        text, and the value returned by ``make_pdf``. When no file is
        given, the first item is an explanatory message, the next nine are
        empty strings and the last is ``None``.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import logging

    if not audio_file:
        # Same 11-slot shape as the success path: message, nine blanks, None.
        return ("No audio file uploaded.",) + ("",) * 9 + (None,)

    # Load the clip resampled to 16 kHz mono.
    y, sr = librosa.load(audio_file, sr=16000, mono=True)

    # Transcription is best-effort: a failure must not abort the analysis,
    # but it is logged so a broken ASR setup is distinguishable from a clip
    # that simply has no speech.
    try:
        text = asr_pipe({"array": y, "sampling_rate": sr})["text"]
    except Exception:
        logging.getLogger(__name__).exception(
            "Transcription failed for %s", audio_file
        )
        text = "[Transcription unavailable]"

    # Core metrics.
    autotune_idx = compute_autotune_index(y, sr)
    polish_idx = compute_production_polish(y, sr)

    emb = extract_embeddings(y, sr)
    ai_prob = calculate_ai_probability(emb, y, sr, autotune_idx)
    human_prob = 1.0 - ai_prob

    # Probabilities expressed as percentages for display and reporting.
    ai_pct = ai_prob * 100.0
    human_pct = human_prob * 100.0

    shade = compute_shade_score(ai_pct, autotune_idx, polish_idx)
    key_sig = detect_key(y, sr)
    bpm = detect_bpm(y, sr)
    voice_text = estimate_voice_type(y, sr)

    scientific_text = build_scientific_analysis(
        ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, polish_idx
    )
    clapback_text = build_clapback(
        ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, voice_text
    )

    # The report is titled with the uploaded file's base name.
    clip_title = os.path.basename(audio_file)

    pdf_path = make_pdf(
        ai_pct,
        human_pct,
        autotune_idx,
        shade,
        key_sig,
        bpm,
        text,
        scientific_text,
        clapback_text,
        clip_title,
        polish_idx,
    )

    return (
        text,
        f"{ai_pct:.1f}%",
        f"{human_pct:.1f}%",
        f"{autotune_idx:.1f}",
        f"{shade:.1f}",
        key_sig,
        f"{bpm:.1f}",
        voice_text,
        scientific_text,
        clapback_text,
        pdf_path,
    )
641
 
642
 
643
+ # --------------------------------------------------------------
644
  # UI
645
+ # --------------------------------------------------------------
646
  with gr.Blocks() as demo:
647
+ gr.HTML(
648
+ """
649
+ <div style='text-align:center; padding:20px;'>
650
+ <h1 style='font-size:36px; font-weight:800;'>
651
+ 👋 Tour de Fierce Audio Clapback Engine™
652
+ </h1>
653
+ <p style='color:#ccc;'>
654
+ AI Detector • Autotune Detector • Key & BPM • Forensic Reporting
655
+ </p>
656
+ </div>
657
+ """
658
+ )
659
 
660
  with gr.Row():
661
  audio_in = gr.Audio(type="filepath", label="Upload audio")
662
  run_btn = gr.Button("Run Clapback 👏", variant="primary")
663
 
664
  with gr.Row():
665
+ transcript = gr.Textbox(
666
+ label="Transcript",
667
+ interactive=False,
668
+ lines=5,
669
+ show_label=True,
670
+ )
671
+
672
+ with gr.Row():
673
  ai_out = gr.Textbox(label="AI Likelihood", interactive=False)
674
  human_out = gr.Textbox(label="Human Likelihood", interactive=False)
675
  atune_out = gr.Textbox(label="Autotune Index", interactive=False)
 
680
  bpm_out = gr.Textbox(label="Tempo (BPM)", interactive=False)
681
  voice_out = gr.Textbox(label="Suggested Voice Type", interactive=False)
682
 
683
+ with gr.Row():
684
+ forensic_out = gr.Textbox(
685
+ label="Technical Forensic Analysis",
686
+ interactive=False,
687
+ lines=12,
688
+ )
689
+ clapback_out = gr.Textbox(
690
+ label="Clapback Shade Report",
691
+ interactive=False,
692
+ lines=12,
693
+ )
694
+
695
  pdf_download = gr.File(label="Download Report")
696
 
697
  run_btn.click(
698
+ fn=run_analysis,
699
  inputs=audio_in,
700
  outputs=[
701
  transcript,
 
706
  key_out,
707
  bpm_out,
708
  voice_out,
709
+ forensic_out,
710
+ clapback_out,
711
+ pdf_download,
712
+ ],
713
  )
714
 
715
  demo.launch()