Commit
·
1b3143f
1
Parent(s):
0fa3c25
- app.py +68 -0
- lda_model.pkl +3 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
3 |
+
from sklearn.decomposition import LatentDirichletAllocation
|
4 |
+
import pickle
|
5 |
+
|
6 |
+
# Page header: app title plus a subtitle describing the task.
st.title("Unsupervised Text Analysis App with Training")
st.subheader("Train an LDA Model for Topic Modeling")

# Make sure the session-state slot for the trained model exists before any
# later section reads it; it stays None until a model has been trained.
if "lda_model" not in st.session_state:
    st.session_state["lda_model"] = None
|
13 |
+
|
14 |
+
# Built-in corpus: ten single-sentence documents used as the training data.
texts = [
    "The economy is experiencing significant growth this year.",
    "Climate change is one of the most pressing global challenges.",
    "Artificial intelligence is transforming industries worldwide.",
    "Renewable energy sources are becoming more popular and cost-effective.",
    "Sports events bring people together and promote cultural exchange.",
    "Advances in medicine have greatly improved life expectancy.",
    "Education plays a critical role in shaping the future of societies.",
    "Travel and tourism contribute significantly to the global economy.",
    "Space exploration inspires innovation and collaboration.",
    "Social media platforms influence public opinion and behavior.",
]

# Render a heading followed by the raw documents.
st.write("### Dataset:")
st.write(texts)
|
31 |
+
|
32 |
+
# Training parameters: the user picks the number of topics (2 to 10,
# defaulting to 3) with a slider.
st.subheader("Training Parameters")
num_topics = st.slider(
    "Select the number of topics for training",
    min_value=2,
    max_value=10,
    value=3,
)

# Turn the corpus into a document-term count matrix, dropping English stop
# words and capping the vocabulary at 1000 terms.
vectorizer = CountVectorizer(max_features=1000, stop_words="english")
doc_term_matrix = vectorizer.fit_transform(texts)
|
39 |
+
|
40 |
+
# Train the LDA model on demand and list the topics it found.
st.subheader("Training the LDA Model")
train_clicked = st.button("Train Model")
if train_clicked:
    with st.spinner("Training the LDA model..."):
        model = LatentDirichletAllocation(random_state=42, n_components=num_topics)
        model.fit(doc_term_matrix)
        # Keep the fitted model in session state so the save section can read it.
        st.session_state.lda_model = model

    st.success("Training Completed!")

    # Summarise each topic by its five highest-weighted vocabulary terms,
    # strongest first.
    vocabulary = vectorizer.get_feature_names_out()
    topic_summaries = []
    for number, weights in enumerate(model.components_, start=1):
        strongest = weights.argsort()[::-1][:5]
        terms = ", ".join(vocabulary[i] for i in strongest)
        topic_summaries.append(f"Topic {number}: {terms}")

    st.write("### Identified Topics:")
    for summary in topic_summaries:
        st.write(summary)
|
59 |
+
|
60 |
+
# Save the trained model: pickle the model held in session state to
# `lda_model.pkl` in the current working directory when the button is clicked.
st.subheader("Save the Trained Model")
if st.button("Save Model"):
    trained_model = st.session_state.lda_model
    # Compare against None explicitly: "no model yet" is represented by None,
    # and an object's truthiness is not a reliable stand-in for that.
    if trained_model is not None:
        # NOTE(review): only the LDA model is pickled; the fitted vectorizer
        # is not saved alongside it -- confirm whether consumers of the
        # pickle need the vocabulary as well.
        try:
            with open("lda_model.pkl", "wb") as f:
                pickle.dump(trained_model, f)
        except (OSError, pickle.PicklingError) as exc:
            # Report the failure in the UI instead of letting it surface as
            # an unhandled exception.
            st.error(f"Could not save the model: {exc}")
        else:
            st.success("Model saved as `lda_model.pkl`.")
    else:
        st.error("Please train the model first before saving.")
|
lda_model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7e8868198c4e65547d5160c31ec2a311e0a482bc361339495e9b6ad78f48d10
|
3 |
+
size 8871
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
scikit-learn
|
3 |
+
transformers
|