# Text Summarizer: a Gradio app offering TF-IDF, TextRank, and BART summarization
# for raw text, uploaded files (PDF/DOCX/TXT/images), web pages, and YouTube transcripts.
# Third-party dependencies: gradio, nltk, numpy, networkx, scikit-learn, transformers
# (with torch), requests, PyPDF2, beautifulsoup4, PyMuPDF, python-docx, Pillow,
# pytesseract (plus the Tesseract OCR binary), and youtube-transcript-api.

import string

import gradio as gr
import nltk
import numpy as np
import networkx as nx
import requests
import docx
import fitz  # PyMuPDF
import pytesseract
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from PIL import Image
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from youtube_transcript_api import YouTubeTranscriptApi

# Download only the NLTK resources the tokenizers and stopword list need
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # required by newer NLTK releases; ignored by older ones
nltk.download('stopwords', quiet=True)

# Load the pre-trained BART model and tokenizer for abstractive summarization
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


def preprocess_text(text):
    """Split text into sentences and return them alongside lowercased,
    stopword- and punctuation-free versions used for scoring."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words
                          if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences


def build_similarity_matrix(sentences):
    """Build a sentence-by-sentence cosine-similarity matrix over TF-IDF vectors."""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix


def textrank_summary(text, num_sentences=5):
    """Extractive summary: rank sentences with PageRank over the similarity graph."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)),
                              reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary


def tfidf_summary(text, num_sentences=5):
    """Extractive summary: rank sentences by the sum of their TF-IDF weights."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary


def bart_summary(text):
    """Abstractive summary generated with facebook/bart-large-cnn."""
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def extract_text_from_url(url):
    """Fetch a web page and return the text of its <p> elements."""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file with PyMuPDF."""
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"


def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file."""
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"


def extract_text_from_file(file):
    """Extract text from an uploaded PDF, DOCX, image (via OCR), or plain-text file."""
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    return text


def extract_text_from_youtube(url):
    """Fetch the transcript of a YouTube video and join it into a single string."""
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Error: invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"


def summarize_text(text, file, link, youtube_link, method):
    """Pick the first provided input source, extract its text, and summarize it
    with the selected method."""
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)

    # Pass extraction errors straight through to the output box
    if "Error" in input_text:
        return input_text
    if not input_text.strip():
        return "Please provide some text, a file, a link, or a YouTube URL."

    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)


# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)

# Launch the interface
interface.launch(share=True)