import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import fitz # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
# Download only the NLTK data the app actually needs (tokenizer models and stopwords)
nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases for sent/word tokenization
nltk.download('stopwords')
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
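# The model stays on CPU by default. On a GPU box you could move it over (an
# optional tweak, not in the original; bart_summary's input_ids would then need
# the same .to(device) call):
#
#     import torch
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     model = model.to(device)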
def preprocess_text(text):
    """Split text into sentences; return both the originals and lowercased,
    stopword- and punctuation-free versions for scoring."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences
def build_similarity_matrix(sentences):
    """Build a sentence-by-sentence cosine-similarity matrix from TF-IDF vectors."""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix
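# Note: cosine_similarity leaves 1.0 on the diagonal, so nx.from_numpy_array
# below gives every sentence a self-loop that slightly skews PageRank. A common
# refinement (not applied here) is to zero it first:
#
#     np.fill_diagonal(similarity_matrix, 0)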
def textrank_summary(text, num_sentences=5):
    """Extractive summary: rank sentences with PageRank over the similarity graph."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary
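# textrank_summary (and tfidf_summary below) joins the top sentences in score
# order, so the summary can read out of sequence. A minimal sketch that keeps
# document order instead (a hypothetical helper, not part of the original app):
def textrank_summary_ordered(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    scores = nx.pagerank(nx.from_numpy_array(build_similarity_matrix(preprocessed_sentences)))
    # Take the indices of the highest-scoring sentences, then restore text order.
    top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:num_sentences])
    return ' '.join(original_sentences[i] for i in top_indices)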
def tfidf_summary(text, num_sentences=5):
    """Extractive summary: score each sentence by the sum of its TF-IDF weights."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary
def bart_summary(text):
    """Abstractive summary with BART; inputs beyond 1024 tokens are truncated."""
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
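# bart-large-cnn only attends to the first 1024 tokens, so the truncation above
# silently drops the tail of long articles. A rough chunk-and-merge sketch
# (bart_summary_long is a hypothetical helper, not the app's shipped behaviour):
def bart_summary_long(text, chunk_tokens=900):
    token_ids = tokenizer(text, truncation=False)["input_ids"]
    chunks = [token_ids[i:i + chunk_tokens] for i in range(0, len(token_ids), chunk_tokens)]
    # Summarize each chunk independently, then stitch the pieces together.
    partials = [bart_summary(tokenizer.decode(chunk, skip_special_tokens=True)) for chunk in chunks]
    return ' '.join(partials)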
def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)  # timeout so a dead link doesn't hang the app
        response.raise_for_status()  # surface HTTP errors in the except branch
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            text += page.extract_text() or ''  # extract_text() can return None for image-only pages
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        text = f"Error: unsupported file type '.{file_extension}'"
    return text
def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Error: invalid YouTube URL"  # "Error" prefix so summarize_text can catch it
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    if not input_text:
        return "Please provide some text, a file, a link, or a YouTube URL."
    if "Error" in input_text:
        return input_text
    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)
# Launch the interface
interface.launch(share=True)
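# share=True also publishes a temporary public gradio.live URL; drop the flag
# to keep the app local-only.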