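"""Gradio app that summarizes YouTube videos.

Given a YouTube URL, the app fetches the video title and description via the
YouTube Data API, pulls the transcript with youtube-transcript-api, and
summarizes it in chunks with facebook/bart-large-cnn.
"""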
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline, AutoTokenizer
import torch
import re
import gradio as gr
import requests
import os
# -------------------
# CONFIGURATION
# -------------------
model_path = "facebook/bart-large-cnn"
max_tokens = 1024  # BART's maximum input length in tokens
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")  # Set via environment variable; never hard-code API keys
# -------------------
# Load model
# -------------------
tokenizer = AutoTokenizer.from_pretrained(model_path)
text_summary = pipeline(
    task="summarization",
    model=model_path,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
# -------------------
# Pydantic workaround
# -------------------
# Gradio's schema generation can fail on starlette Request objects, so register
# a permissive core schema for them.
from pydantic import BaseModel, ConfigDict
from pydantic_core import core_schema
from starlette.requests import Request

class MyModel(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    request: Request

def get_pydantic_core_schema(request_type, handler):
    return core_schema.any_schema()

Request.__get_pydantic_core_schema__ = get_pydantic_core_schema
# -------------------
# Utilities
# -------------------
def extract_video_id(url):
    regex = r"(?:v=|\/)([0-9A-Za-z_-]{11})"
    match = re.search(regex, url)
    if match:
        return match.group(1)
    else:
        raise ValueError("Invalid YouTube URL")
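# Example: extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
# and extract_video_id("https://youtu.be/dQw4w9WgXcQ") both return "dQw4w9WgXcQ".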
def fetch_video_metadata(video_id):
    try:
        url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={YOUTUBE_API_KEY}"
        response = requests.get(url, timeout=10)  # avoid hanging if the API is unreachable
        data = response.json()
        # The videos.list endpoint returns {"items": [{"snippet": {"title": ..., "description": ...}}]}
        # for a known video; "items" is empty for unknown IDs.
        if "items" in data and data["items"]:
            title = data["items"][0]["snippet"]["title"]
            description = data["items"][0]["snippet"]["description"]
            return title, description
        else:
            return "Title Not Found", "Description Not Found"
    except Exception as e:
        return "Error fetching title", str(e)
def get_transcript_text(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        full_text = " ".join([entry['text'] for entry in transcript])
        return full_text
    except Exception as e:
        # Transcript may be disabled or unavailable for this video.
        return None
def split_into_chunks(text, tokenizer, max_tokens):
    # Greedily pack words into chunks whose tokenized length stays under max_tokens.
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for word in words:
        token_len = len(tokenizer.tokenize(word))
        if current_len + token_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_len = token_len
        else:
            current_chunk.append(word)
            current_len += token_len
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
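# Illustrative usage (assumed, not called by the app):
#   chunks = split_into_chunks(transcript_text, tokenizer, max_tokens)
#   print(f"{len(chunks)} chunks, largest ~{max(len(tokenizer.tokenize(c)) for c in chunks)} tokens")
# Note: per-word token counts only approximate the length of the joined chunk,
# so a chunk may land slightly over or under the 1024-token limit.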
def summarize_text(full_text):
    if not full_text.strip():
        return "Transcript is empty or could not be retrieved."
    chunks = split_into_chunks(full_text, tokenizer, max_tokens)
    summaries = []
    for i, chunk in enumerate(chunks):
        try:
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = text_summary(chunk, max_length=180, min_length=10, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            summaries.append(f"[Error summarizing chunk: {str(e)}]")
    return "\n\n".join(summaries)
# -------------------
# Main Summary Function
# -------------------
def summarize_youtube_video(url):
    try:
        video_id = extract_video_id(url)
        title, description = fetch_video_metadata(video_id)
        transcript = get_transcript_text(video_id)
        if not transcript:
            return f"**Title**: {title}\n\n**Transcript not available.**"
        summary = summarize_text(transcript)
        return f"**Title**: {title}\n\n**Summary**:\n{summary}"
    except Exception as e:
        return f"Error: {str(e)}"
# -------------------
# Gradio UI
# -------------------
demo = gr.Interface(
    fn=summarize_youtube_video,
    inputs=[gr.Textbox(label='Enter YouTube URL')],
    outputs=[gr.Textbox(label='Video Title and Summary', lines=15)],
    title='YouTube Video Summarizer with Metadata',
    description='Paste a YouTube video URL to get its title and a summary of the transcript (metadata fetched via the YouTube Data API).'
)
if __name__ == "__main__":
    demo.launch()
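# Launch notes (assumed defaults): `python app.py` serves the UI at http://127.0.0.1:7860;
# pass share=True to demo.launch() for a temporary public Gradio link.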