Commit
·
a7eddd3
1
Parent(s):
27462dc
Upload 5 files
Browse files- model.pkl +3 -0
- requirements.txt +0 -0
- testing.py +69 -0
- train.py +101 -0
- vectorizer.pkl +3 -0
model.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8d5bd11c858325b31519dcad3a8ad8f69075f8edcfb269213705db7df8175e9
|
3 |
+
size 277695407
|
requirements.txt
ADDED
Binary file (1.53 kB). View file
|
|
testing.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
print("Loading libraries...")
|
4 |
+
start_time = time.time()
|
5 |
+
|
6 |
+
import sklearn
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
|
9 |
+
from sklearn.neural_network import MLPRegressor
|
10 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import numpy as np
|
13 |
+
import datasets
|
14 |
+
import pickle
|
15 |
+
|
16 |
+
print(f"Libraries loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
|
17 |
+
|
18 |
+
# --- Phase: runtime configuration -------------------------------------------
print("Setting configuration...")
start_time = time.time()

# Set configuration
# working_memory is scikit-learn's global budget (in MiB) for temporary
# arrays used by its chunked operations.
sklearn.set_config(working_memory=4096)
# Maximum number of dataset rows used by the later phases of this script.
data_size = 100_000

elapsed_ms = round((time.time() - start_time) * 1000, 3)
print(f"Configuration set in {elapsed_ms} ms.")
|
27 |
+
|
28 |
+
# --- Phase: load the pickled model and vectorizer -----------------------------
print("Loading model and vectorizer...")
start_time = time.time()

# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the
# file. Only run this against model.pkl / vectorizer.pkl files that come from
# a source you trust (e.g. produced by your own training run).
with open('model.pkl', 'rb') as model_file:
    mlp = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    count_vect = pickle.load(vectorizer_file)

print(f"Model and vectorizer loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
|
38 |
+
|
39 |
+
# --- Phase: load the dataset ---------------------------------------------------
print("Loading data...")
start_time = time.time()

# Load data
# Request the 'binary' configuration of the dataset, then convert its
# 'train' split into a pandas DataFrame.
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech',
    'binary',
)
train_split = dataset['train']
df = train_split.to_pandas()

elapsed_ms = round((time.time() - start_time) * 1000, 3)
print(f"Data loaded in {elapsed_ms} ms.")
print(df.head())
|
48 |
+
|
49 |
+
# --- Phase: make sure the vectorizer has a vocabulary ---------------------------
print("Fitting vectorizer...")
start_time = time.time()

# Extract text
X_text = df['text'][:data_size]  # Assuming 'text' is the column containing the text data

# CountVectorizer.fit() returns the vectorizer itself, not a document-term
# matrix, so its result is deliberately not bound to a name.
# Fit only when the unpickled vectorizer has no learned vocabulary yet:
# refitting an already-fitted vectorizer could replace the vocabulary (and
# therefore the feature columns) the model was trained against.
if not hasattr(count_vect, "vocabulary_"):
    count_vect.fit(X_text)

print(f"Vectorizer fit in {round((time.time() - start_time) * 1000, 3)} ms.")
|
59 |
+
|
60 |
+
# --- Phase: score a few sample sentences ------------------------------------------
print("Predicting...")
start_time = time.time()

# Sample inputs for the classifier.
sentences = [
    "It is about time for all illegals to go back to their country of origin and keep our freeway open and prevent heavy traffic.",
    "OMG! The EGO\'s of these young, young, inexperienced women display are remarkable! So self absorbed - so, so proud of the mistakes their constituents all made in giving them a bit of power. They\'ll be quite surprised to find that the vast majority of Americans think that these women represent the very WORST of \"modern\" America. Egos out of control !!",
    "Instagram refugees lmao, let's build a wall instead",
]
# Map the raw text onto the vectorizer's existing vocabulary.
vectorized_sentences = count_vect.transform(sentences)

predictions = mlp.predict(vectorized_sentences)
# Pair each sentence with its prediction row and print the mapping.
values = dict(zip(sentences, predictions))
print(values)
|
train.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
print("Loading libraries...")
|
4 |
+
start_time = time.time()
|
5 |
+
|
6 |
+
import sklearn
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
|
9 |
+
from sklearn.neural_network import MLPRegressor
|
10 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import datasets
|
13 |
+
import pickle
|
14 |
+
|
15 |
+
print(f"Libraries loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
|
16 |
+
# --- Phase: create the vectorizer and write it to disk ---------------------------
print("Loading vectorizer...")
start_time = time.time()

count_vect = CountVectorizer()

print(f"Vectorizer loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
print("Saving vectorizer...")
start_time = time.time()

# Save vectorizer
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): count_vect has not been fitted at this point, so the object
# pickled here carries no vocabulary; whoever loads it must fit it before
# calling transform().
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vect, vectorizer_file)
|
27 |
+
|
28 |
+
|
29 |
+
# --- Phase: runtime configuration -------------------------------------------
print("Setting configuration...")
start_time = time.time()

# Set configuration
# working_memory is scikit-learn's global budget (in MiB) for temporary
# arrays used by its chunked operations.
sklearn.set_config(working_memory=4096)
# Maximum number of dataset rows used for training and evaluation.
data_size = 100_000

elapsed_ms = round((time.time() - start_time) * 1000, 3)
print(f"Configuration set in {elapsed_ms} ms.")
|
38 |
+
# --- Phase: load the dataset ---------------------------------------------------
print("Loading data...")
start_time = time.time()

# Load data
# Request the 'binary' configuration of the dataset, then convert its
# 'train' split into a pandas DataFrame.
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech',
    'binary',
)
train_split = dataset['train']
df = train_split.to_pandas()

elapsed_ms = round((time.time() - start_time) * 1000, 3)
print(f"Data loaded in {elapsed_ms} ms.")
print(df.head())
|
47 |
+
|
48 |
+
# --- Phase: build the feature matrix and the target frame -------------------------
print("Preprocessing data...")
start_time = time.time()

# Extract text and labels
X_text = df['text'][:data_size]  # Assuming 'text' is the column containing the text data
y_columns = ['hate_speech_score', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech']
y = df[y_columns][:data_size]
# Missing target values are replaced with 0 so the regressor gets no NaNs.
y = y.fillna(0)

# Convert text to vectors (this also learns the vocabulary).
X = count_vect.fit_transform(X_text)

# Persist the vectorizer now that it is fitted. The vocabulary learned above
# defines the feature columns the model is trained on, so anything that later
# loads vectorizer.pkl can transform text without having to refit.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vect, vectorizer_file)

print(f"Data preprocessed in {round((time.time() - start_time) * 1000, 3)} ms.")
|
61 |
+
# --- Phase: train/test split -------------------------------------------------------
print("Splitting data...")
start_time = time.time()

# Hold out 25% of the rows for evaluation. No random_state is passed, so the
# split differs from run to run.
split_parts = train_test_split(X, y, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

elapsed_ms = round((time.time() - start_time) * 1000, 3)
print(f"Data split in {elapsed_ms} ms.")
|
67 |
+
# --- Phase: fit the regressor ------------------------------------------------------
print("Training model...")
start_time = time.time()

# Create MLPRegressor model: five hidden layers of decreasing width, trained
# with Adam for at most 100 iterations. tol is tiny and n_iter_no_change
# (5000) exceeds max_iter, so the no-improvement stop can never trigger.
mlp = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64, 32, 16),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate_init=0.003,
    max_iter=100,
    tol=0.000000000001,
    early_stopping=False,
    n_iter_no_change=5000,
    verbose=True,
)
mlp.fit(X_train, y_train)

elapsed_s = round((time.time() - start_time), 3)
print(f"Model trained in {elapsed_s} s.")
|
75 |
+
# --- Phase: evaluate on the held-out rows and show the loss curve -------------------
print("Evaluating model...")

# Predict and score
predictions = mlp.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Mean squared error: ", mse)

# Plot the per-iteration training loss recorded by the model during fit().
plt.plot(mlp.loss_curve_)
plt.title("Loss curve")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()

print("Done!")
|
89 |
+
|
90 |
+
# Save the model to disk
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; with a model of this size a partially flushed pickle
# would be unusable.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(mlp, model_file)
|
94 |
+
|
95 |
+
# Test the model for fun :)
# Keep the raw strings in their own list so they can be used as dict keys.
sentences = ["Fuck you you stupid nigger", "You're a piece of shit", "Awesome!", "Oh my god, I never realized that!"]

# Use transform(), not fit_transform(): refitting here would replace the
# vocabulary learned from the training data with one built from these four
# sentences, and the resulting feature width would no longer match what the
# model was trained on.
vectorized_sentences = count_vect.transform(sentences)

predictions = mlp.predict(vectorized_sentences)
# Write dict of sentences and predictions
values = dict(zip(sentences, predictions))
print(values)
|
101 |
+
|
vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fef08e75c9f3f95fffc3adcac5b216d5cba29accadff7f1bbd02b71e98cf0fae
|
3 |
+
size 400
|