# Text Summarizer: a Gradio app offering TF-IDF, TextRank, and BART summarization
# for raw text, uploaded files (PDF/DOCX/TXT/images), web pages, and YouTube transcripts.
# Third-party dependencies: gradio, nltk, numpy, networkx, scikit-learn, transformers
# (with torch), requests, PyPDF2, beautifulsoup4, PyMuPDF, python-docx, Pillow,
# pytesseract (plus the Tesseract OCR binary), and youtube-transcript-api.

import string

import gradio as gr
import nltk
import numpy as np
import networkx as nx
import requests
import docx
import fitz  # PyMuPDF
import pytesseract
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from PIL import Image
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from youtube_transcript_api import YouTubeTranscriptApi

# Download only the NLTK resources the tokenizers and stopword list need
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # required by newer NLTK releases; ignored by older ones
nltk.download('stopwords', quiet=True)

# Load the pre-trained BART model and tokenizer for abstractive summarization
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


def preprocess_text(text):
    """Split text into sentences and return them alongside lowercased,
    stopword- and punctuation-free versions used for scoring."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [word for word in words
                          if word not in stop_words and word not in string.punctuation]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences


def build_similarity_matrix(sentences):
    """Build a sentence-by-sentence cosine-similarity matrix over TF-IDF vectors."""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix


def textrank_summary(text, num_sentences=5):
    """Extractive summary: rank sentences with PageRank over the similarity graph."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)),
                              reverse=True)
    summary = ' '.join([sentence for score, sentence in ranked_sentences[:num_sentences]])
    return summary


def tfidf_summary(text, num_sentences=5):
    """Extractive summary: rank sentences by the sum of their TF-IDF weights."""
    original_sentences, preprocessed_sentences = preprocess_text(text)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    ranked_sentences = [original_sentences[i] for i in np.argsort(sentence_scores, axis=0)[::-1]]
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary


def bart_summary(text):
    """Abstractive summary generated with facebook/bart-large-cnn."""
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1000,
        min_length=50,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


def extract_text_from_url(url):
    """Fetch a web page and return the text of its <p> elements."""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs])
        return text
    except Exception as e:
        return f"Error fetching link: {e}"


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file with PyMuPDF."""
    try:
        document = fitz.open(pdf_path)
        text = ''
        for page in document:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"


def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file."""
    try:
        document = docx.Document(docx_path)
        text = ' '.join([para.text for para in document.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {e}"


def extract_text_from_file(file):
    """Extract text from an uploaded PDF, DOCX, image (via OCR), or plain-text file."""
    file_extension = file.name.split('.')[-1].lower()
    text = ''
    if file_extension == 'pdf':
        pdf_reader = PdfReader(file.name)
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
    elif file_extension == 'docx':
        doc = docx.Document(file.name)
        text = ' '.join([para.text for para in doc.paragraphs])
    elif file_extension in ('png', 'jpg', 'jpeg'):
        image = Image.open(file.name)
        text = pytesseract.image_to_string(image)
    elif file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    return text


def extract_text_from_youtube(url):
    """Fetch the transcript of a YouTube video and join it into a single string."""
    try:
        if "youtube.com" in url:
            video_id = url.split('v=')[1].split('&')[0]
        elif "youtu.be" in url:
            video_id = url.split('/')[-1]
        else:
            return "Error: invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ' '.join([item['text'] for item in transcript])
        return text
    except Exception as e:
        return f"Error fetching YouTube transcript: {e}"


def summarize_text(text, file, link, youtube_link, method):
    """Pick the first provided input source, extract its text, and summarize it
    with the selected method."""
    input_text = ""
    if text:
        input_text = text
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)

    # Pass extraction errors straight through to the output box
    if "Error" in input_text:
        return input_text
    if not input_text.strip():
        return "Please provide some text, a file, a link, or a YouTube URL."

    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)


# Create a Gradio interface
interface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(
            lines=8,
            placeholder="Paste your text here...",
            label="Input Text"
        ),
        gr.File(
            label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter URL here...",
            label="Input Link"
        ),
        gr.Textbox(
            lines=1,
            placeholder="Enter YouTube video URL here...",
            label="Input YouTube Link"
        ),
        gr.Radio(
            choices=["TF-IDF", "TextRank", "Abstractive"],
            label="Summarization Method",
            value="Abstractive"
        )
    ],
    outputs=gr.Textbox(
        lines=15,
        label="Concise Summary"
    ),
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)

# Launch the interface
interface.launch(share=True)