import os
import re
import unicodedata

import gradio
import nltk
import pandas as pd
import psycopg2
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


def get_paragraph(row, index):
    """Join the list of strings stored in row[index] into one lowercase string."""
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans


def remove_accents(text):
    """Strip accents by normalizing to NFKD and dropping non-ASCII bytes."""
    return unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')


def get_clean_text(row, index):
    """Lowercase and tokenize row[index], keeping alphabetic non-stopword tokens."""
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # The regex already excludes punctuation, so no extra '.' check is needed.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            clean_text += ' ' + word
    return clean_text


def combine(row, indices):
    """Concatenate several text columns of a row into a single string."""
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans


stop_words = set(stopwords.words('english'))

query = "SELECT * FROM base_springerdata"

CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'

# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')


# Load the raw data, preferring the database and falling back to a local CSV.
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If connecting to the database fails, load data from the compressed CSV file
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df


# main_df
main_df = load_sql_data(query)


# Build the per-journal dataframe: articles, authors, and keywords per journal.
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main
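# A small, optional sanity check for the text-cleaning helpers above. The
# sample strings are hypothetical (not taken from the dataset), and the
# function is only defined here, never called at import time; invoke it
# manually when debugging.
def _demo_clean_text():
    sample = {'keywords': 'Deep Learning, Neural Networks; Optimisation'}
    # Punctuation tokens and stopwords are dropped; everything is lowercased.
    # Expected: ' deep learning neural networks optimisation'
    print(get_clean_text(sample, 'keywords'))
    # combine() simply concatenates columns; a final get_clean_text pass
    # then normalizes the result.
    row = {'a': 'graph theory', 'b': 'spectral methods'}
    print(combine(row, ['a', 'b']))  # ' graph theory spectral methods'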
# Journal dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')


# Fit the journal-level TF-IDF model over each journal's combined tags.
def get_tfidfs(journal_main):
    # Both keys must be checked for membership; the original
    # `if VECTORIZER and JOURNAL_TFIDF in CACHE` only tested the second key
    # because VECTORIZER is a truthy string constant.
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix


vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tf-idfs and vectorizer for journals completed')


def get_article_df(row):
    """Build the per-article dataframe for the journal in this row."""
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stopwords. The parentheses matter:
    # without them, `and` binds tighter than `or`, so nouns would skip the
    # stopword check entirely.
    article['Tags'] = article['Tagged'].apply(
        lambda x: [word for word, tag in x
                   if (tag.startswith('NN') or tag.startswith('JJ'))
                   and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']),
        axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized',
                            'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article


def get_vectorizer(row):
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')


def get_tfidf_matrix(row):
    return row['article_vectorizer'].fit_transform(row['article_df']['Tags'])


# Attach a per-journal article dataframe, vectorizer, and TF-IDF matrix.
def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df


journal_main = article_preprocessing(journal_main)
print('done')

# Prediction
journal_threshold = 4


def get_journal_index(user_input):
    """Return indices of the top journals by cosine similarity to the query."""
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][
        :min(journal_threshold, len(indices))]
    return top_recommendations


article_threshold = 10


def get_article_recommendations(user_input):
    """Rank articles within the top journals; returns (score, article_id, journal_id) tuples."""
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform(
            [user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [
            (cosine_similarities[i], i, journal_id)
            for i in indices if cosine_similarities[i] > 0
        ][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
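# An optional demonstration of the two-stage retrieval above: journals are
# ranked first, then articles within each shortlisted journal. The query
# string is a made-up example; call this manually when debugging, it is
# never executed at import time.
def _demo_ranking(query_text='machine learning optimisation'):
    for journal_id in get_journal_index(query_text):
        print('journal:', journal_main['publication_title'][journal_id])
    # Show the three highest-scoring articles across all shortlisted journals.
    for score, article_id, journal_id in get_article_recommendations(query_text)[:3]:
        print(round(float(score), 3),
              journal_main['article_df'][journal_id].iloc[article_id, 0])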
def validation(text):
    """Accept input only if it contains at least one noun or adjective."""
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result


def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            # Avoid shadowing the imported cosine_similarity function here.
            score, article_id, journal_id = article
            link = {
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []


validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    # gradio.outputs.JSON() was removed in newer Gradio releases; the JSON
    # component is the current equivalent.
    outputs=gradio.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)

links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)

# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface],
                             ["articles link generation", "validation"])

# Run the app
if __name__ == "__main__":
    app.launch()
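# A hedged sketch of how a client could call the running app with the
# official gradio_client package (installed separately). The endpoint name
# below is an assumption based on Gradio's default routing and may differ
# across Gradio versions; check the app's "Use via API" page for the exact
# names. Kept as a comment so nothing executes when this module is imported.
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   links = client.predict("machine learning", api_name="/predict")
#   print(links)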