import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
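
# Rough dependency sketch inferred from the imports above (these are the typical
# PyPI package names and may differ by environment; pytesseract also needs the
# Tesseract OCR binary installed system-wide, and transformers needs torch):
#   pip install gradio nltk numpy networkx scikit-learn transformers torch \
#       requests PyPDF2 beautifulsoup4 PyMuPDF python-docx Pillow pytesseract \
#       youtube-transcript-api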

# Download only the NLTK resources used below (sentence/word tokenizers and stopwords);
# 'punkt_tab' is required by newer NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences

def build_similarity_matrix(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

def textrank_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
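    # Rank sentences by PageRank score; note the summary keeps them in score
    # order, not in their original document order.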
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary

def tfidf_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary

def bart_summary(text):
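    # The tokenizer truncates anything beyond 1024 tokens, so only the start of
    # a very long document is seen by the model.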
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"

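# Alternative PDF extractor based on PyMuPDF (fitz); the upload path below uses
# PyPDF2 via extract_text_from_file instead.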
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"

def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            # extract_text() can return nothing for image-only pages
            text += page.extract_text() or ''
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    return text

def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1].split('?')[0]
        else:
            return "Invalid YouTube URL"

        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"

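# Inputs are consumed in priority order: pasted text, then an uploaded file,
# then a web URL, then a YouTube link; only the first non-empty one is used.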
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    
    if "Error" in input_text:
        return input_text

    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)

# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)

# Launch the interface
if __name__ == "__main__":
    interface.launch(share=True)