import os
import re
import unicodedata

import gradio
import nltk
import pandas as pd
import psycopg2
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


def get_paragraph(row, index):
    """Join the list of strings stored in row[index] into one lowercase string."""
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans


def remove_accents(text):
    """Strip accents by normalizing to NFKD and dropping non-ASCII bytes."""
    return unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')


def get_clean_text(row, index):
    """Lowercase and tokenize row[index], keeping alphabetic non-stopword tokens."""
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # The regex already excludes punctuation, so no extra '.' check is needed.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            clean_text += ' ' + word
    return clean_text


def combine(row, indices):
    """Concatenate several text columns of a row into a single string."""
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans


stop_words = set(stopwords.words('english'))

query = "SELECT * FROM base_springerdata"

CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'

# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')


# Load the raw data, preferring the database and falling back to a local CSV.
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If connecting to the database fails, load data from the compressed CSV file
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df


# main_df
main_df = load_sql_data(query)


# Build the per-journal dataframe: articles, authors, and keywords per journal.
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main
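# A small, optional sanity check for the text-cleaning helpers above. The
# sample strings are hypothetical (not taken from the dataset), and the
# function is only defined here, never called at import time; invoke it
# manually when debugging.
def _demo_clean_text():
    sample = {'keywords': 'Deep Learning, Neural Networks; Optimisation'}
    # Punctuation tokens and stopwords are dropped; everything is lowercased.
    # Expected: ' deep learning neural networks optimisation'
    print(get_clean_text(sample, 'keywords'))
    # combine() simply concatenates columns; a final get_clean_text pass
    # then normalizes the result.
    row = {'a': 'graph theory', 'b': 'spectral methods'}
    print(combine(row, ['a', 'b']))  # ' graph theory spectral methods'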
# Journal dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')


# Fit the journal-level TF-IDF model over each journal's combined tags.
def get_tfidfs(journal_main):
    # Both keys must be checked for membership; the original
    # `if VECTORIZER and JOURNAL_TFIDF in CACHE` only tested the second key
    # because VECTORIZER is a truthy string constant.
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix


vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tf-idfs and vectorizer for journals completed')


def get_article_df(row):
    """Build the per-article dataframe for the journal in this row."""
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stopwords. The parentheses matter:
    # without them, `and` binds tighter than `or`, so nouns would skip the
    # stopword check entirely.
    article['Tags'] = article['Tagged'].apply(
        lambda x: [word for word, tag in x
                   if (tag.startswith('NN') or tag.startswith('JJ'))
                   and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']),
        axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized',
                            'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article


def get_vectorizer(row):
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')


def get_tfidf_matrix(row):
    return row['article_vectorizer'].fit_transform(row['article_df']['Tags'])


# Attach a per-journal article dataframe, vectorizer, and TF-IDF matrix.
def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df


journal_main = article_preprocessing(journal_main)
print('done')

# Prediction
journal_threshold = 4


def get_journal_index(user_input):
    """Return indices of the top journals by cosine similarity to the query."""
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][
        :min(journal_threshold, len(indices))]
    return top_recommendations


article_threshold = 10


def get_article_recommendations(user_input):
    """Rank articles within the top journals; returns (score, article_id, journal_id) tuples."""
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform(
            [user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [
            (cosine_similarities[i], i, journal_id)
            for i in indices if cosine_similarities[i] > 0
        ][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
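# An optional demonstration of the two-stage retrieval above: journals are
# ranked first, then articles within each shortlisted journal. The query
# string is a made-up example; call this manually when debugging, it is
# never executed at import time.
def _demo_ranking(query_text='machine learning optimisation'):
    for journal_id in get_journal_index(query_text):
        print('journal:', journal_main['publication_title'][journal_id])
    # Show the three highest-scoring articles across all shortlisted journals.
    for score, article_id, journal_id in get_article_recommendations(query_text)[:3]:
        print(round(float(score), 3),
              journal_main['article_df'][journal_id].iloc[article_id, 0])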
def validation(text):
    """Accept input only if it contains at least one noun or adjective."""
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result


def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            # Avoid shadowing the imported cosine_similarity function here.
            score, article_id, journal_id = article
            link = {
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []


validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    # gradio.outputs.JSON() was removed in newer Gradio releases; the JSON
    # component is the current equivalent.
    outputs=gradio.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)

links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)

# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface],
                             ["articles link generation", "validation"])

# Run the app
if __name__ == "__main__":
    app.launch()
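# A hedged sketch of how a client could call the running app with the
# official gradio_client package (installed separately). The endpoint name
# below is an assumption based on Gradio's default routing and may differ
# across Gradio versions; check the app's "Use via API" page for the exact
# names. Kept as a comment so nothing executes when this module is imported.
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   links = client.predict("machine learning", api_name="/predict")
#   print(links)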