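# app.py: Gradio text summarizer.
# Produces extractive (TF-IDF, TextRank) or abstractive (BART) summaries from
# pasted text, uploaded files (PDF/DOCX/image/TXT), web pages, or YouTube
# transcripts.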
import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BartForConditionalGeneration, BartTokenizer
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import fitz # PyMuPDF
import docx
from PIL import Image
import pytesseract
from youtube_transcript_api import YouTubeTranscriptApi
# Download only the NLTK data the app actually uses (tokenizers and stopwords);
# downloading 'all' pulls in far more corpora than needed.
nltk.download('punkt')
nltk.download('stopwords')
# On newer NLTK releases you may also need: nltk.download('punkt_tab')
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
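# Optional: move the model to a GPU when one is available (torch is already
# installed as a transformers dependency). If you enable this, the tokenized
# inputs in bart_summary must be moved to the same device:
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = model.to(device)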
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences
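# Pairwise cosine similarity over TF-IDF vectors: entry (i, j) measures the
# word overlap between sentences i and j and becomes an edge weight in the
# TextRank graph below.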
def build_similarity_matrix(sentences):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix
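# TextRank: treat sentences as graph nodes, run PageRank over the similarity
# graph, and keep the highest-scoring sentences. Note the selected sentences
# are joined in rank order, not in their original document order.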
def textrank_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary
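# TF-IDF baseline: score each sentence by the sum of its TF-IDF term weights
# (no graph involved), then keep the top-scoring sentences.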
def tfidf_summary(text, num_sentences=5):
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary
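# Abstractive summarization with BART. The tokenizer truncates inputs at 1024
# tokens, so only the beginning of a very long document is summarized; chunking
# the input would be needed for full coverage.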
def bart_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
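# --- Input extractors ---
# Each extractor returns plain text on success, or a string starting with
# "Error" that summarize_text passes straight back to the UI.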
def extract_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_file(file):
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ''
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    return text
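# Pull the video id out of either URL form (youtube.com/watch?v=... or
# youtu.be/...) and fetch its transcript; videos with transcripts disabled
# raise, which the except clause turns into an error message.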
def extract_text_from_youtube(url):
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    if not input_text:
        return "Please provide some text, a file, a link, or a YouTube link."
    if "Error" in input_text:
        return input_text
    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)
# Launch the interface; share=True also requests a temporary public Gradio URL
interface.launch(share=True)