import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import fitz # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
# Download only the NLTK data the app actually needs (tokenizer models and stopwords)
nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases for sent/word tokenization
nltk.download('stopwords')
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
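# The model stays on CPU by default. On a GPU box you could move it over (an
# optional tweak, not in the original; bart_summary's input_ids would then need
# the same .to(device) call):
#
#     import torch
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     model = model.to(device)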
def preprocess_text(text):
    """Split text into sentences; return both the originals and lowercased,
    stopword- and punctuation-free versions for scoring."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences
def build_similarity_matrix(sentences):
    """Build a sentence-by-sentence cosine-similarity matrix from TF-IDF vectors."""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix
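# Note: cosine_similarity leaves 1.0 on the diagonal, so nx.from_numpy_array
# below gives every sentence a self-loop that slightly skews PageRank. A common
# refinement (not applied here) is to zero it first:
#
#     np.fill_diagonal(similarity_matrix, 0)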
def textrank_summary(text, num_sentences=5):
    """Extractive summary: rank sentences with PageRank over the similarity graph."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary
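# textrank_summary (and tfidf_summary below) joins the top sentences in score
# order, so the summary can read out of sequence. A minimal sketch that keeps
# document order instead (a hypothetical helper, not part of the original app):
def textrank_summary_ordered(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    scores = nx.pagerank(nx.from_numpy_array(build_similarity_matrix(preprocessed_sentences)))
    # Take the indices of the highest-scoring sentences, then restore text order.
    top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:num_sentences])
    return ' '.join(original_sentences[i] for i in top_indices)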
def tfidf_summary(text, num_sentences=5):
    """Extractive summary: score each sentence by the sum of its TF-IDF weights."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary
def bart_summary(text):
    """Abstractive summary with BART; inputs beyond 1024 tokens are truncated."""
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
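# bart-large-cnn only attends to the first 1024 tokens, so the truncation above
# silently drops the tail of long articles. A rough chunk-and-merge sketch
# (bart_summary_long is a hypothetical helper, not the app's shipped behaviour):
def bart_summary_long(text, chunk_tokens=900):
    token_ids = tokenizer(text, truncation=False)["input_ids"]
    chunks = [token_ids[i:i + chunk_tokens] for i in range(0, len(token_ids), chunk_tokens)]
    # Summarize each chunk independently, then stitch the pieces together.
    partials = [bart_summary(tokenizer.decode(chunk, skip_special_tokens=True)) for chunk in chunks]
    return ' '.join(partials)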
def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)  # timeout so a dead link doesn't hang the app
        response.raise_for_status()  # surface HTTP errors in the except branch
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            text += page.extract_text() or ''  # extract_text() can return None for image-only pages
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        text = f"Error: unsupported file type '.{file_extension}'"
    return text
def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Error: invalid YouTube URL"  # "Error" prefix so summarize_text can catch it
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    if not input_text:
        return "Please provide some text, a file, a link, or a YouTube URL."
    if "Error" in input_text:
        return input_text
    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)
# Launch the interface
interface.launch(share=True)
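# share=True also publishes a temporary public gradio.live URL; drop the flag
# to keep the app local-only.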