import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
# Download the required NLTK data files (only the tokenizer models and
# stopword lists are needed; 'all' works but is unnecessarily large and slow)
nltk.download('punkt')
nltk.download('punkt_tab')  # used by the tokenizers in recent NLTK releases
nltk.download('stopwords')
# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
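# Note: this keeps the model on CPU, which is what a default Space provides.
# For GPU inference one could move it with model.to("cuda") and move the
# tokenized inputs in bart_summary accordingly.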
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences
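# Illustrative example (not part of the app flow):
#   preprocess_text("NLTK is great. It tokenizes text.")
#   -> (['NLTK is great.', 'It tokenizes text.'], ['nltk great', 'tokenizes text'])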
def build_similarity_matrix(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix
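# The result is an n x n matrix whose (i, j) entry is the cosine similarity
# between the TF-IDF vectors of sentences i and j (1.0 on the diagonal for
# any sentence that still has words after preprocessing).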
def textrank_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary
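# Note: the selected sentences come out in PageRank-score order, not document
# order. To present them in their original order instead (often more
# readable), one could rank sentence indices and re-sort, e.g.:
#   top = sorted(sorted(scores, key=scores.get, reverse=True)[:num_sentences])
#   summary = ' '.join(original_sentences[i] for i in top)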
def tfidf_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary
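# Here a sentence's score is simply the sum of its TF-IDF weights, so
# sentences packed with terms that are distinctive across the sentence set
# rank highest.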
def bart_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
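# BART's encoder accepts at most 1024 tokens, so longer inputs are truncated
# above. A common workaround (not implemented here) is to split the text into
# roughly 1024-token chunks, summarize each chunk, then summarize the
# concatenated chunk summaries.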
def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    if file_extension == 'pdf':
        return extract_text_from_pdf(file.name)
    elif file_extension == 'docx':
        return extract_text_from_docx(file.name)
    elif file_extension in ('png', 'jpg', 'jpeg'):
        try:
            image = Image.open(file.name)
            return pytesseract.image_to_string(image)
        except Exception as e:
            return f"Error reading image: {e}"
    elif file_extension == 'txt':
        try:
            with open(file.name, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            return f"Error reading text file: {e}"
    return f"Error: unsupported file type '{file_extension}'"
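# Note: the OCR branch needs the Tesseract binary installed on the host
# (e.g. listed in a Space's packages.txt); the pytesseract package alone
# is not enough.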
def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1].split('?')[0]
        else:
            return "Error: invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    if not input_text:
        return "Please provide some text, a file, a link, or a YouTube URL to summarize."
    if "Error" in input_text:
        return input_text
    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
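# Input precedence: pasted text wins over an uploaded file, which wins over a
# web link, which wins over a YouTube link; only one source is used per call.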
# Create the Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)
# Launch the interface
interface.launch(share=True)
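# share=True asks Gradio for a temporary public gradio.live URL when running
# locally; on Hugging Face Spaces the app is already publicly served, so the
# flag has no effect there.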