Sai004 committed on
Commit
7c4da79
·
1 Parent(s): c078a46

Updated app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -71
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio
2
  import pandas as pd
3
  import psycopg2
@@ -14,16 +15,20 @@ nltk.download('punkt')
14
  nltk.download('averaged_perceptron_tagger')
15
  nltk.download('stopwords')
16
 
 
17
  def get_paragraph(row, index):
18
  ans = ''
19
  for x in row[index]:
20
  ans = ans + ' ' + x.lower()
21
  return ans
22
 
 
23
  def remove_accents(text):
24
- text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
 
25
  return text
26
 
 
27
  def get_clean_text(row, index):
28
  if not isinstance(row[index], str):
29
  return ''
@@ -38,22 +43,23 @@ def get_clean_text(row, index):
38
  clean_text += ' ' + word
39
  return clean_text
40
 
 
41
  def combine(row, indices):
42
  ans = ''
43
  for i in indices:
44
  ans = ans + ' ' + row[i]
45
  return ans
46
 
 
47
  stop_words = set(stopwords.words('english'))
48
  query = "SELECT * FROM base_springerdata"
49
 
50
- CACHE={}
51
- SQL_KEY='sql'
52
- JOURNAL_COMPLETE='journal_complete'
53
- JOURNAL_PARTIAL='journal_partial'
54
- VECTORIZER='vectorizer'
55
- JOURNAL_TFIDF='journal_tfidf'
56
- import os
57
 
58
  # Access the secrets
59
  HOST = os.getenv('DATABASE_HOST')
@@ -61,131 +67,149 @@ DATABASE = os.getenv('DATABASE_NAME')
61
  USER = os.getenv('DATABASE_USER')
62
  PASSWORD = os.getenv('DATABASE_PASSWORD')
63
  # load sql
 
 
64
  def load_sql_data(query):
65
  if SQL_KEY in CACHE:
66
  return CACHE[SQL_KEY]
67
  conn = psycopg2.connect(
68
- host=HOST,
69
- database=DATABASE,
70
- user=USER,
71
- password=PASSWORD,
72
- sslmode="require"
73
  )
74
- df =pd.read_sql_query(query, conn)
75
  df = df.drop(['item_doi'], axis=1)
 
 
76
  conn.close()
77
  CACHE[SQL_KEY] = df
78
  return df
 
 
79
  # main_df
80
  main_df = load_sql_data(query)
81
- # Close the database connection
82
 
83
 
84
  # load journal_df
85
-
86
  def get_journal_df(df):
87
  if JOURNAL_PARTIAL in CACHE:
88
  return CACHE[JOURNAL_PARTIAL]
89
- journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
 
90
  journal_art.set_index(['publication_title'], inplace=True)
91
 
92
- journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
 
93
  journal_auth.set_index('publication_title', inplace=True)
94
 
95
- journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
96
- journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
 
 
97
  journal_key.set_index(['publication_title'], inplace=True)
98
 
99
  journal_main = journal_art.join([journal_key, journal_auth])
100
  print('journal_main intial')
101
  journal_main.reset_index(inplace=True)
102
- journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
103
- journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
104
- journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
105
- journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
106
- journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)
107
-
108
- journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
109
- journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
110
- CACHE[JOURNAL_PARTIAL]=journal_main
 
 
 
 
 
 
 
111
  return journal_main
112
 
113
- journal_main=get_journal_df(main_df)
114
- print('journal_main processed')
115
  # Journal Dataframe
 
 
116
 
117
- # load tfidfs
118
 
 
119
  def get_tfidfs(journal_main):
120
  if VECTORIZER and JOURNAL_TFIDF in CACHE:
121
- return CACHE[VECTORIZER],CACHE[JOURNAL_TFIDF]
122
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
123
  journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
124
- CACHE[VECTORIZER]=vectorizer
125
- CACHE[JOURNAL_TFIDF]=journal_tfidf_matrix
126
- return vectorizer,journal_tfidf_matrix
127
 
128
- vectorizer,journal_tfidf_matrix = get_tfidfs(journal_main)
 
129
  print('tfids and vectorizer for journals completed')
130
 
 
131
  def get_article_df(row):
132
- article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
133
- article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
 
 
134
  article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
135
  article['Tokenized'] = article['item_title'].apply(word_tokenize)
136
  article['Tagged'] = article['Tokenized'].apply(pos_tag)
137
  article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
138
  tag.startswith('NN') or tag.startswith('JJ') and word.lower() not in stop_words])
139
  article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
140
- article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
141
- article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
 
 
142
  article.reset_index(inplace=True)
143
  article.set_index('index', inplace=True)
144
  return article
145
 
146
 
147
-
148
  def get_vectorizer(row):
149
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
150
  return vectorizer
151
 
152
 
153
  def get_tfidf_matrix(row):
154
- tfidf_matrix = row['article_vectorizer'].fit_transform(row['article_df']['Tags'])
 
155
  return tfidf_matrix
156
 
 
157
  def article_preprocessing(df):
158
  if JOURNAL_COMPLETE in CACHE:
159
  return CACHE[JOURNAL_COMPLETE]
160
  df['article_df'] = df.apply(get_article_df, axis=1)
161
  df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
162
  df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
163
- CACHE[JOURNAL_COMPLETE]=df
164
  return df
165
 
166
- journal_main=article_preprocessing(journal_main)
167
- print('done')
168
-
169
-
170
-
171
-
172
-
173
-
174
-
175
-
176
 
 
 
177
 
178
 
179
- # #### prediction
180
  journal_threshold = 4
181
 
 
182
  def get_journal_index(user_input):
183
  user_tfidf = vectorizer.transform([user_input])
184
- cosine_similarities = cosine_similarity(user_tfidf, journal_tfidf_matrix).flatten()
 
185
  indices = cosine_similarities.argsort()[::-1]
186
- top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(journal_threshold, len(indices))]
 
187
  return top_recommendations
188
 
 
189
  article_threshold = 10
190
 
191
 
@@ -193,8 +217,10 @@ def get_article_recommendations(user_input):
193
  recommended_journals = get_journal_index(user_input)
194
  recommendations = []
195
  for journal_id in recommended_journals:
196
- user_tfidf = journal_main['article_vectorizer'][journal_id].transform([user_input])
197
- cosine_similarities = cosine_similarity(user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
 
 
198
  indices = cosine_similarities.argsort()[::-1]
199
  top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
200
  cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
@@ -218,18 +244,18 @@ def get_links(user_input):
218
  return links
219
 
220
 
221
-
222
  gradio_interface = gradio.Interface(
223
- fn=get_links,
224
- inputs="text",
225
- outputs=gradio.outputs.JSON(),
226
- examples=[
227
- ["Jill"],
228
- ["Sam"]
229
- ],
230
- title="REST API with Gradio and Huggingface Spaces",
231
- description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
232
- article="© POSA MOKSHITH 2023"
 
233
  )
234
 
235
- gradio_interface.launch()
 
1
+ import os
2
  import gradio
3
  import pandas as pd
4
  import psycopg2
 
15
  nltk.download('averaged_perceptron_tagger')
16
  nltk.download('stopwords')
17
 
18
+
19
  def get_paragraph(row, index):
20
  ans = ''
21
  for x in row[index]:
22
  ans = ans + ' ' + x.lower()
23
  return ans
24
 
25
+
26
  def remove_accents(text):
27
+ text = unicodedata.normalize('NFKD', text).encode(
28
+ 'ASCII', 'ignore').decode('utf-8')
29
  return text
30
 
31
+
32
  def get_clean_text(row, index):
33
  if not isinstance(row[index], str):
34
  return ''
 
43
  clean_text += ' ' + word
44
  return clean_text
45
 
46
+
47
  def combine(row, indices):
48
  ans = ''
49
  for i in indices:
50
  ans = ans + ' ' + row[i]
51
  return ans
52
 
53
+
54
  stop_words = set(stopwords.words('english'))
55
  query = "SELECT * FROM base_springerdata"
56
 
57
# Module-level cache shared by the loader functions in this file; each
# constant below is the key under which one intermediate result is stored.
CACHE = {}
SQL_KEY = 'sql'  # frame stored by load_sql_data
JOURNAL_COMPLETE = 'journal_complete'  # frame stored by article_preprocessing
JOURNAL_PARTIAL = 'journal_partial'  # frame stored by get_journal_df
VECTORIZER = 'vectorizer'  # fitted vectorizer stored by get_tfidfs
JOURNAL_TFIDF = 'journal_tfidf'  # TF-IDF matrix stored by get_tfidfs
 
63
 
64
  # Access the secrets
65
  HOST = os.getenv('DATABASE_HOST')
 
67
  USER = os.getenv('DATABASE_USER')
68
  PASSWORD = os.getenv('DATABASE_PASSWORD')
69
  # load sql
70
+
71
+
72
  def load_sql_data(query):
73
  if SQL_KEY in CACHE:
74
  return CACHE[SQL_KEY]
75
  conn = psycopg2.connect(
76
+ host=HOST,
77
+ database=DATABASE,
78
+ user=USER,
79
+ password=PASSWORD
 
80
  )
81
+ df = pd.read_sql_query(query, conn)
82
  df = df.drop(['item_doi'], axis=1)
83
+
84
+ # Close the database connection
85
  conn.close()
86
  CACHE[SQL_KEY] = df
87
  return df
88
+
89
+
90
  # main_df
91
  main_df = load_sql_data(query)
 
92
 
93
 
94
  # load journal_df
 
95
  def get_journal_df(df):
96
  if JOURNAL_PARTIAL in CACHE:
97
  return CACHE[JOURNAL_PARTIAL]
98
+ journal_art = df.groupby('publication_title')['item_title'].apply(
99
+ list).reset_index(name='Articles')
100
  journal_art.set_index(['publication_title'], inplace=True)
101
 
102
+ journal_auth = df.groupby('publication_title')['authors'].apply(
103
+ list).reset_index(name='authors')
104
  journal_auth.set_index('publication_title', inplace=True)
105
 
106
+ journal_key = df.drop_duplicates(
107
+ subset=["publication_title", "keywords"], keep='first')
108
+ journal_key = journal_key.drop(
109
+ ['item_title', 'authors', 'publication_year', 'url'], axis=1)
110
  journal_key.set_index(['publication_title'], inplace=True)
111
 
112
  journal_main = journal_art.join([journal_key, journal_auth])
113
  print('journal_main intial')
114
  journal_main.reset_index(inplace=True)
115
+ journal_main['Articles'] = journal_main.apply(
116
+ get_paragraph, index='Articles', axis=1)
117
+ journal_main['Articles'] = journal_main.apply(
118
+ get_clean_text, index='Articles', axis=1)
119
+ journal_main['authors'] = journal_main.apply(
120
+ get_paragraph, index='authors', axis=1)
121
+ journal_main['authors'] = journal_main.apply(
122
+ get_clean_text, index='authors', axis=1)
123
+ journal_main['keywords'] = journal_main.apply(
124
+ get_clean_text, index='keywords', axis=1)
125
+
126
+ journal_main['Tags'] = journal_main.apply(
127
+ combine, indices=['keywords', 'Articles', 'authors'], axis=1)
128
+ journal_main['Tags'] = journal_main.apply(
129
+ get_clean_text, index='Tags', axis=1)
130
+ CACHE[JOURNAL_PARTIAL] = journal_main
131
  return journal_main
132
 
133
+
 
134
  # Journal Dataframe
135
+ journal_main = get_journal_df(main_df)
136
+ print('journal_main processed')
137
 
 
138
 
139
+ # load tfidfs
140
  def get_tfidfs(journal_main):
141
  if VECTORIZER and JOURNAL_TFIDF in CACHE:
142
+ return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
143
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
144
  journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
145
+ CACHE[VECTORIZER] = vectorizer
146
+ CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
147
+ return vectorizer, journal_tfidf_matrix
148
 
149
+
150
+ vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
151
  print('tfids and vectorizer for journals completed')
152
 
153
+
154
  def get_article_df(row):
155
+ article = main_df.loc[main_df['publication_title'] ==
156
+ journal_main['publication_title'][row.name]].copy()
157
+ article['item_title'] = article.apply(
158
+ get_clean_text, index='item_title', axis=1)
159
  article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
160
  article['Tokenized'] = article['item_title'].apply(word_tokenize)
161
  article['Tagged'] = article['Tokenized'].apply(pos_tag)
162
  article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
163
  tag.startswith('NN') or tag.startswith('JJ') and word.lower() not in stop_words])
164
  article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
165
+ article['Tags'] = article.apply(
166
+ lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
167
+ article = article.drop(['keywords', 'publication_title',
168
+ 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
169
  article.reset_index(inplace=True)
170
  article.set_index('index', inplace=True)
171
  return article
172
 
173
 
 
174
  def get_vectorizer(row):
175
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
176
  return vectorizer
177
 
178
 
179
  def get_tfidf_matrix(row):
180
+ tfidf_matrix = row['article_vectorizer'].fit_transform(
181
+ row['article_df']['Tags'])
182
  return tfidf_matrix
183
 
184
+
185
  def article_preprocessing(df):
186
  if JOURNAL_COMPLETE in CACHE:
187
  return CACHE[JOURNAL_COMPLETE]
188
  df['article_df'] = df.apply(get_article_df, axis=1)
189
  df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
190
  df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
191
+ CACHE[JOURNAL_COMPLETE] = df
192
  return df
193
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ journal_main = article_preprocessing(journal_main)
196
+ print('done')
197
 
198
 
199
+ # prediction
200
  journal_threshold = 4
201
 
202
+
203
  def get_journal_index(user_input):
204
  user_tfidf = vectorizer.transform([user_input])
205
+ cosine_similarities = cosine_similarity(
206
+ user_tfidf, journal_tfidf_matrix).flatten()
207
  indices = cosine_similarities.argsort()[::-1]
208
+ top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(
209
+ journal_threshold, len(indices))]
210
  return top_recommendations
211
 
212
+
213
  article_threshold = 10
214
 
215
 
 
217
  recommended_journals = get_journal_index(user_input)
218
  recommendations = []
219
  for journal_id in recommended_journals:
220
+ user_tfidf = journal_main['article_vectorizer'][journal_id].transform([
221
+ user_input])
222
+ cosine_similarities = cosine_similarity(
223
+ user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
224
  indices = cosine_similarities.argsort()[::-1]
225
  top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
226
  cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
 
244
  return links
245
 
246
 
 
247
  gradio_interface = gradio.Interface(
248
+ fn=get_links,
249
+ inputs="text",
250
+ outputs=gradio.outputs.JSON(),
251
+ examples=[
252
+ ["AI"],
253
+ ["Biochemicals"],
254
+ ["Rocket Science"]
255
+ ],
256
+ title="Sprinkler Article Generator API",
257
+ description="This is a AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
258
+ article="© ScholarSync 2023"
259
  )
260
 
261
+ gradio_interface.launch(share=True)