# Page residue from the hosting site ("Spaces: Running"); not part of the program.
from youtube_transcript_api import YouTubeTranscriptApi | |
from transformers import pipeline, AutoTokenizer | |
import torch | |
import re | |
import gradio as gr | |
import requests | |
import os | |
# ------------------- | |
# CONFIGURATION | |
# ------------------- | |
# Hugging Face model id used for both the tokenizer and the summarization pipeline.
model_path = "facebook/bart-large-cnn"
# Upper bound on the number of tokens placed in one transcript chunk.
max_tokens = 1024
# YouTube Data API key, read from the environment so the secret is never stored
# in the source file. When the variable is unset the value is an empty string.
# NOTE(review): the key that was previously hard-coded on this line has been
# published with the source and should be revoked and replaced.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
# ------------------- | |
# Load model | |
# ------------------- | |
# Tokenizer is loaded separately because it is also used for chunk sizing.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# bfloat16 when CUDA is available, float32 otherwise.
_summary_dtype = torch.float32
if torch.cuda.is_available():
    _summary_dtype = torch.bfloat16

# Summarization pipeline built from the same model id and the tokenizer above.
text_summary = pipeline(
    task="summarization",
    model=model_path,
    tokenizer=tokenizer,
    torch_dtype=_summary_dtype,
)
# Shim between pydantic and starlette: a schema hook is attached to starlette's
# Request class that always yields an "any" core schema.
# NOTE(review): presumably this works around a pydantic schema-generation error
# for Request-typed parameters -- confirm against the installed versions.
from pydantic import BaseModel, ConfigDict, PydanticUserError


class MyModel(BaseModel):
    # Allow field types that pydantic has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Kept as a string annotation, exactly as in the original declaration.
    request: 'starlette.requests.Request'


from pydantic_core import core_schema
from starlette.requests import Request


def get_pydantic_core_schema(request_type, handler):
    """Return an unconstrained ("any") pydantic core schema.

    Both arguments are accepted to satisfy the hook signature and are ignored.
    """
    return core_schema.any_schema()


# Install the hook on the Request class itself.
Request.__get_pydantic_core_schema__ = get_pydantic_core_schema
# ------------------- | |
# Utilities | |
# ------------------- | |
def extract_video_id(url):
    """Return the 11-character video ID found in a YouTube URL.

    Accepted inputs:
      * a bare 11-character ID,
      * any text carrying a ``v=<id>`` parameter (``watch?v=...``),
      * a URL whose path has a segment that is exactly an ID
        (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``, ...).

    The host name is never searched and IDs must be exactly 11 characters
    long, so long host names or longer path segments (for example channel
    IDs) are no longer truncated into a bogus ID.

    Args:
        url: The URL (or bare ID) entered by the user.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If ``url`` is not a string or no ID can be found.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from urllib.parse import urlsplit

    id_char = "[0-9A-Za-z_-]"
    id_pattern = id_char + "{11}"

    if not isinstance(url, str):
        raise ValueError("Invalid YouTube URL")
    text = url.strip()

    # A bare video ID is accepted as-is.
    if re.fullmatch(id_pattern, text):
        return text

    # "v=<id>" must start a parameter and must not be followed by more ID
    # characters, otherwise a longer value would be silently cut to 11 chars.
    by_param = re.search(
        r"(?:^|[?&#])v=(" + id_pattern + ")(?!" + id_char + ")", text
    )
    if by_param:
        return by_param.group(1)

    try:
        path = urlsplit(text).path
    except ValueError:
        # urlsplit rejects some malformed inputs; report them uniformly.
        raise ValueError("Invalid YouTube URL") from None

    # Only whole path segments count; the network location is excluded.
    for segment in path.split("/"):
        if re.fullmatch(id_pattern, segment):
            return segment

    raise ValueError("Invalid YouTube URL")
def fetch_video_metadata(video_id):
    """Look up a video's title and description via the YouTube Data API.

    This is a best-effort lookup: it never raises. Failures are reported
    through the returned strings so the caller can still show a result.

    Args:
        video_id: The YouTube video ID to look up.

    Returns:
        A ``(title, description)`` tuple. When the response has no items the
        tuple is ``("Title Not Found", "Description Not Found")``; when the
        request or parsing fails it is ``("Error fetching title", <message>)``.
    """
    try:
        # Query values go through ``params`` so they are URL-encoded, and a
        # timeout keeps a stalled connection from hanging the request forever.
        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos",
            params={"part": "snippet", "id": video_id, "key": YOUTUBE_API_KEY},
            timeout=10,
        )
        data = response.json()
        items = data.get("items") if isinstance(data, dict) else None
        if items:
            snippet = items[0]["snippet"]
            return snippet["title"], snippet["description"]
        return "Title Not Found", "Description Not Found"
    except Exception as e:
        # Deliberately broad: metadata is optional and must not abort the run.
        message = str(e)
        # Exception text may contain the request URL; keep the key out of it.
        if YOUTUBE_API_KEY:
            message = message.replace(YOUTUBE_API_KEY, "***")
        return "Error fetching title", message
def get_transcript_text(video_id):
    """Fetch a video's transcript and return it as one space-joined string.

    Args:
        video_id: The YouTube video ID whose transcript is requested.

    Returns:
        The transcript text, or ``None`` when it cannot be retrieved. The
        reason for a failure is printed rather than discarded.
    """
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Static API offered by older youtube-transcript-api releases.
            entries = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            # Newer releases expose an instance ``fetch`` whose result converts
            # to the same list-of-dicts shape via ``to_raw_data``.
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return " ".join(entry["text"] for entry in entries)
    except Exception as e:
        # Best-effort by design: the caller treats None as "no transcript".
        print(f"Could not fetch transcript for {video_id}: {e}")
        return None
def split_into_chunks(text, tokenizer, max_tokens):
    """Split ``text`` into whitespace-joined chunks that fit a token budget.

    Words are never broken up. Each word is measured with
    ``tokenizer.tokenize(word)`` and words are packed greedily into chunks.

    Args:
        text: The text to split.
        tokenizer: Object with a ``tokenize(str)`` method returning a
            sequence. If it also provides ``num_special_tokens_to_add()``,
            that many tokens are reserved in every chunk for the special
            tokens added when the chunk is encoded.
        max_tokens: Maximum number of tokens per chunk, special tokens
            included.

    Returns:
        A list of chunk strings; empty for empty or whitespace-only text. A
        single word that exceeds the budget on its own still forms one
        (oversized) chunk, because words are not split.
    """
    # Leave room for special tokens when the tokenizer can report them.
    count_special = getattr(tokenizer, "num_special_tokens_to_add", None)
    reserved = count_special() if callable(count_special) else 0
    budget = max(1, max_tokens - reserved)

    chunks = []
    current_chunk = []
    current_len = 0
    for word in text.split():
        token_len = len(tokenizer.tokenize(word))
        if current_chunk and current_len + token_len > budget:
            # Flush only a non-empty chunk; an oversized first word must not
            # produce an empty string chunk.
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
        current_chunk.append(word)
        current_len += token_len
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def summarize_text(full_text):
    """Summarize a transcript chunk by chunk and join the partial summaries.

    Args:
        full_text: The transcript text; ``None`` and blank strings are
            treated as "no transcript".

    Returns:
        The chunk summaries joined by blank lines, or a fixed message when
        there is nothing to summarize. A chunk that fails to summarize
        contributes an ``[Error summarizing chunk: ...]`` marker instead of
        aborting the remaining chunks.
    """
    if not full_text or not full_text.strip():
        return "Transcript is empty or could not be retrieved."

    chunks = split_into_chunks(full_text, tokenizer, max_tokens)
    total = len(chunks)
    summaries = []
    for index, chunk in enumerate(chunks, start=1):
        try:
            print(f"Summarizing chunk {index}/{total}...")
            # truncation=True clips a chunk that still exceeds the model's
            # input limit instead of letting the pipeline raise on it.
            result = text_summary(
                chunk,
                max_length=180,
                min_length=10,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Per-chunk best effort: record the failure and keep going.
            summaries.append(f"[Error summarizing chunk: {str(e)}]")
    return "\n\n".join(summaries)
# ------------------- | |
# Main Summary Function | |
# ------------------- | |
def summarize_youtube_video(url):
    """Produce the text shown in the UI for a YouTube URL.

    Resolves the video ID, looks up the title, fetches the transcript and
    summarizes it. Any exception raised along the way is converted into an
    ``Error: ...`` string, so this function always returns a string.
    """
    try:
        vid = extract_video_id(url)
        # Only the title is used; the description is fetched but not shown.
        title, _description = fetch_video_metadata(vid)
        transcript_text = get_transcript_text(vid)
        if transcript_text:
            summary = summarize_text(transcript_text)
            return f"**Title**: {title}\n\n**Summary**:\n{summary}"
        return f"**Title**: {title}\n\n**Transcript not available.**"
    except Exception as e:
        return f"Error: {str(e)}"
# ------------------- | |
# Gradio UI | |
# ------------------- | |
# One textbox in, one textbox out, both wired to summarize_youtube_video.
_url_box = gr.Textbox(label='Enter YouTube URL')
_result_box = gr.Textbox(label='Video Title and Summary', lines=15)

demo = gr.Interface(
    fn=summarize_youtube_video,
    inputs=[_url_box],
    outputs=[_result_box],
    title='YouTube Video Summarizer with Metadata',
    description='Paste a YouTube video URL to get a title and summarized content using transcript + YouTube API',
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()