import streamlit as st
# UTILITY
import joblib
import pickle
from joblib import load
# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import SnowballStemmer
from gensim.models.doc2vec import Doc2Vec
import numpy as np
comment = ""
tresh1 = 0.500
tresh2 = 0.937
tresh3 = 0.999
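# Three sample decision thresholds compared side by side in the UI below:
# a comment is classified as toxic when the predicted toxic-class probability
# is >= the threshold, so a higher threshold flags fewer comments as toxic.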
# set page setting
st.set_page_config(page_title='Toxic Comments')
# set history var
if 'history' not in st.session_state:
    st.session_state.history = []
# model loading helpers (to be cached)
def importModel(filename):
    model = load(filename)
    return model
def importD2V(filename):
    model = Doc2Vec.load(filename)
    return model
def loadPickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
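# Optional caching sketch (assumes a Streamlit version that provides
# st.cache_resource, i.e. >= 1.18): the loaders above could be cached across
# reruns by decorating them, e.g.:
#   @st.cache_resource
#   def importModel(filename):
#       return load(filename)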
def loadPunkt():
    nltk.download('punkt')
with st.spinner('Loading the models, this could take some time...'):
    loadPunkt()
    normalizer = importModel("normalizerD2V.joblib")
    classifier = importModel("toxicCommModel.joblib")
    model_d2v = importD2V("d2v_comments.model")
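# The paths above are relative, so "normalizerD2V.joblib", "toxicCommModel.joblib",
# "d2v_comments.model" (and "stop_words.pkl" loaded below) are expected to be
# available in the app's working directory.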
# REGEX
def apply_regex(corpus):
    corpus = re.sub(r"\S*\d\S*", " ", corpus)       # drop tokens containing digits
    corpus = re.sub(r"\S*@\S*\s?", " ", corpus)     # drop mentions / e-mail-like tokens
    corpus = re.sub(r"\S*#\S*\s?", " ", corpus)     # drop hashtags
    corpus = re.sub(r'http\S+', ' ', corpus)        # drop URLs
    corpus = re.sub(r'[^a-zA-Z0-9 ]', ' ', corpus)  # keep only alphanumerics and spaces
    corpus = corpus.replace(u'\ufffd', '8')         # replace the unicode replacement character with '8'
    corpus = re.sub(' +', ' ', corpus)              # collapse multiple spaces
    return corpus
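# For example (worked by hand, not executed here):
# apply_regex("Check https://t.co/abc now!!!") -> "Check now "
# (the URL and punctuation are replaced by spaces, which are then collapsed;
#  a single trailing space remains after the final collapse)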
# STOP WORDS - the pickled stop-word list (taken from the spaCy library)
stop_words = loadPickle("stop_words.pkl")
# TOKENIZE TEXT and STOP WORDS REMOVAL (also drops digits and tokens shorter than 2 or longer than 14 characters)
def tokenize(doc):
    tokens_1 = word_tokenize(str(doc))
    return [word.lower() for word in tokens_1 if len(word) > 1 and len(word) < 15 and word not in stop_words and not word.isdigit()]
# STEMMING
stemmer = SnowballStemmer(language="english")
def applyStemming(listOfTokens):
    return [stemmer.stem(token) for token in listOfTokens]
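# For example (assuming neither token is in the pickled stop-word list):
# applyStemming(tokenize("running quickly")) -> ['run', 'quick']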
# PROBS TO CLASS
def probs_to_prediction(probs, threshold):
    # probs comes from predict_proba on a single comment, shape (1, 2);
    # column 1 holds the toxic-class probability
    return 1 if probs[0, 1] >= threshold else 0
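# For example, with probabilities [[0.1, 0.9]]:
# probs_to_prediction(np.array([[0.1, 0.9]]), tresh1) -> 1  (0.9 >= 0.500)
# probs_to_prediction(np.array([[0.1, 0.9]]), tresh2) -> 0  (0.9 <  0.937)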
# PROCESSING
def compute(comment, tresh):
    global preds
    global probs
    global stems
    stems = ""
    preds = []
    comment = apply_regex(comment)
    comment = tokenize(comment)
    comment = applyStemming(comment)
    stems = comment
    vectorizedComment = model_d2v.infer_vector(comment, epochs=70)
    normComment = normalizer.transform([vectorizedComment])
    probs = classifier.predict_proba(normComment)
    for t in tresh:
        preds.append(probs_to_prediction(probs, t))
    with st.container():
        col1, col2, col6 = st.columns(3)
        #col1.metric("Toxic", round(preds[0][1], 4))
        #col2.metric("Non Toxic", round(1-preds[0][1], 4))
        col1.metric("Toxic", round(probs[0][1], 4))
        col2.metric("", "")
        col6.metric("Non Toxic", round(probs[0][0], 4))
        st.markdown("""---""")
    display()
    return None
def display():
    with st.container():
        st.write("#### Different classification outputs at different threshold values:")
        col3, col4, col5 = st.columns(3)
        col3.metric("", "TOXIC" if preds[0]==1 else "NON TOXIC", delta=tresh1)
        col4.metric("", "TOXIC" if preds[1]==1 else "NON TOXIC", delta=tresh2)
        col5.metric("", "TOXIC" if preds[2]==1 else "NON TOXIC", delta=tresh3)
        st.markdown("""---""")
    with st.container():
        st.write("#### Result of the NLP Pipeline:")
        st.write(stems)
    return None
# TITLE
st.write("# ☢️ Toxic Comments Classification ☢️")
st.write("#### Drop a comment and wait for toxicity.")
# INPUT TEXTBOX
comment = st.text_area('', "Drop your comment here! 😎")
# INPUT THRESHOLD
#tresh = st.slider('Set the Threshold, default 0.5', 0.000, 1.000, step=0.0001, value=0.500)
compute(comment, [tresh1, tresh2, tresh3])
# sidebar
st.sidebar.write("""
This is a Toxic Comment Classifier that uses tokenization, stemming, Doc2Vec encoding and a tuned logistic regression model.
It has been trained on a large corpus of comments.
A threshold is used to convert the predicted probability of toxicity into a categorical class [toxic, non toxic].
The value of the threshold can be chosen according to the final application of the classifier.
Three sample thresholds are presented here to show the differences in the final output.
""")