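# app.py: Gradio text summarizer.
# Produces extractive (TF-IDF, TextRank) or abstractive (BART) summaries from
# pasted text, uploaded files (PDF/DOCX/image/TXT), web pages, or YouTube
# transcripts.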
import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import fitz # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
# Download only the NLTK data the app actually uses (tokenizers and stopwords);
# downloading 'all' pulls in far more corpora than needed.
nltk.download('punkt')
nltk.download('stopwords')
# On newer NLTK releases you may also need: nltk.download('punkt_tab')
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
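# Optional: move the model to a GPU when one is available (torch is already
# installed as a transformers dependency). If you enable this, the tokenized
# inputs in bart_summary must be moved to the same device:
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = model.to(device)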
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences
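# Pairwise cosine similarity over TF-IDF vectors: entry (i, j) measures the
# word overlap between sentences i and j and becomes an edge weight in the
# TextRank graph below.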
def build_similarity_matrix(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix
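# TextRank: treat sentences as graph nodes, run PageRank over the similarity
# graph, and keep the highest-scoring sentences. Note the selected sentences
# are joined in rank order, not in their original document order.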
def textrank_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary
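# TF-IDF baseline: score each sentence by the sum of its TF-IDF term weights
# (no graph involved), then keep the top-scoring sentences.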
def tfidf_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary
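# Abstractive summarization with BART. The tokenizer truncates inputs at 1024
# tokens, so only the beginning of a very long document is summarized; chunking
# the input would be needed for full coverage.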
def bart_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
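# --- Input extractors ---
# Each extractor returns plain text on success, or a string starting with
# "Error" that summarize_text passes straight back to the UI.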
def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ''
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    return text
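# Pull the video id out of either URL form (youtube.com/watch?v=... or
# youtu.be/...) and fetch its transcript; videos with transcripts disabled
# raise, which the except clause turns into an error message.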
def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    if not input_text:
        return "Please provide some text, a file, a link, or a YouTube link."
    if "Error" in input_text:
        return input_text
    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)
# Launch the interface; share=True also requests a temporary public Gradio URL
interface.launch(share=True)