import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
)
model_card = """
## Description

**Univariate feature selection** can be used to improve classification accuracy on a noisy dataset.
In **univariate feature selection**, each feature is evaluated independently, and a statistical test is used to determine its strength of association with the target variable.
The most important features are then selected based on their statistical significance, typically using a p-value threshold or a pre-defined number of top features to select.

In this demo, some noisy (non-informative) features are added to the Iris dataset, and a **support vector machine (SVM)** is then used to classify the data both before and after applying univariate feature selection.
The results of the feature selection are presented through the per-feature p-values and the SVM weights, which are plotted for comparison.
The objective of this demo is to evaluate the accuracy of the models and assess the impact of univariate feature selection on the model weights.
You can play around with different values of ``number of top features`` and ``random seed``.

## Dataset

Iris dataset
"""
# The Iris dataset: 150 samples, 4 informative features
X, y = load_iris(return_X_y=True)

# Some noisy (non-informative) features, uncorrelated with the target
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))

# Append the noisy features to the informative ones
X = np.hstack((X, E))
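# A minimal sketch of the selection step described in the model card
# (illustration only, not executed by the app): SelectKBest scores each
# feature with the ANOVA F-test (f_classif) and keeps the k highest-scoring
# columns. The k=4 below is an assumed example value.
#
#     selector = SelectKBest(f_classif, k=4)
#     X_reduced = selector.fit_transform(X, y)  # (150, 24) -> (150, 4)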
def do_train(k_features, random_state):
    # Split the dataset so feature selection and evaluation use separate data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=random_state
    )

    # Univariate feature selection with the ANOVA F-test
    selector = SelectKBest(f_classif, k=k_features)
    selector.fit(X_train, y_train)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()

    # Plot the normalized univariate scores
    fig1, axes1 = plt.subplots()
    X_indices = np.arange(X.shape[-1])
    axes1.bar(X_indices - 0.05, scores, width=0.2)
    axes1.set_title("Feature univariate score")
    axes1.set_xlabel("Feature number")
    axes1.set_ylabel(r"Univariate score ($-Log(p_{value})$)")

    # SVM trained on all features
    clf = make_pipeline(MinMaxScaler(), LinearSVC())
    clf.fit(X_train, y_train)
    svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
    svm_weights /= svm_weights.sum()

    # SVM trained only on the selected features
    clf_selected = make_pipeline(
        SelectKBest(f_classif, k=k_features), MinMaxScaler(), LinearSVC()
    )
    clf_selected.fit(X_train, y_train)
    svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.sum()

    # Compare univariate scores and SVM weights before/after selection
    fig2, axes2 = plt.subplots()
    axes2.bar(
        X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
    )
    axes2.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")
    axes2.bar(
        X_indices[selector.get_support()] - 0.05,
        svm_weights_selected,
        width=0.2,
        label="SVM weights after selection",
    )
    axes2.set_title("Comparing feature selection")
    axes2.set_xlabel("Feature number")
    axes2.set_yticks(())
    axes2.axis("tight")
    axes2.legend(loc="upper right")

    text = (
        f"Classification accuracy without selecting features: "
        f"{clf.score(X_test, y_test) * 100:.2f}%. "
        f"Classification accuracy after univariate feature selection: "
        f"{clf_selected.score(X_test, y_test) * 100:.2f}%"
    )
    return fig1, fig2, text
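# Standalone usage example (illustration only; the Gradio UI below calls
# do_train through slider events). The argument values here are assumptions:
#
#     fig1, fig2, summary = do_train(k_features=4, random_state=0)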
with gr.Blocks(theme=theme) as demo:
    gr.Markdown('''
    <div>
        <h1 style='text-align: center'>Univariate Feature Selection</h1>
    </div>
    ''')
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py\">scikit-learn</a>")
    k_features = gr.Slider(minimum=2, maximum=10, step=1, value=2, label="Number of top features to select")
    random_state = gr.Slider(minimum=0, maximum=2000, step=1, value=0, label="Random seed")
    with gr.Row():
        with gr.Column():
            plot_1 = gr.Plot(label="Univariate score")
        with gr.Column():
            plot_2 = gr.Plot(label="Comparing feature selection")
    with gr.Row():
        results = gr.Textbox(label="Results")

    k_features.change(fn=do_train, inputs=[k_features, random_state], outputs=[plot_1, plot_2, results])
    random_state.change(fn=do_train, inputs=[k_features, random_state], outputs=[plot_1, plot_2, results])
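    # Optional addition (assumption, not in the original app): render an
    # initial result on page load so the plots are not empty before the first
    # slider change. demo.load is a standard Gradio Blocks event listener.
    demo.load(fn=do_train, inputs=[k_features, random_state], outputs=[plot_1, plot_2, results])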
demo.launch()