ShadowProgrammer committed on
Commit
a7eddd3
·
1 Parent(s): 27462dc

Upload 5 files

Browse files
Files changed (5) hide show
  1. model.pkl +3 -0
  2. requirements.txt +0 -0
  3. testing.py +69 -0
  4. train.py +101 -0
  5. vectorizer.pkl +3 -0
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8d5bd11c858325b31519dcad3a8ad8f69075f8edcfb269213705db7df8175e9
3
+ size 277695407
requirements.txt ADDED
Binary file (1.53 kB). View file
 
testing.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-test the pickled hate-speech regressor on a few sample sentences.

Loads ``model.pkl`` and ``vectorizer.pkl`` from the working directory,
makes sure the vectorizer has a vocabulary, and prints the model's
predictions for each hand-written sentence.
"""
import time

print("Loading libraries...")
start_time = time.time()

# Imported after the timer starts on purpose, so the load time can be reported.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import datasets
import pickle

print(f"Libraries loaded in {round((time.time() - start_time) * 1000, 3)} ms.")

print("Setting configuration...")
start_time = time.time()

# Set configuration
sklearn.set_config(working_memory=4096)
# Number of leading dataset rows used to build the vocabulary; train.py uses
# the same value, and the two must agree for the feature columns to line up.
data_size = 100000

print(f"Configuration set in {round((time.time() - start_time) * 1000, 3)} ms.")

print("Loading model and vectorizer...")
start_time = time.time()

# NOTE(security): pickle.load executes arbitrary code embedded in the file.
# Only run this against model/vectorizer files you produced or trust.
with open('model.pkl', 'rb') as model_file:
    mlp = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    count_vect = pickle.load(vectorizer_file)

print(f"Model and vectorizer loaded in {round((time.time() - start_time) * 1000, 3)} ms.")

# A fitted CountVectorizer exposes ``vocabulary_``. Refit only when the pickle
# holds an unfitted vectorizer: refitting a fitted one would throw away the
# vocabulary the model was trained against, and skipping the fit also avoids
# downloading the dataset when it is not needed.
if not hasattr(count_vect, "vocabulary_"):
    print("Loading data...")
    start_time = time.time()

    # Load data
    dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
    df = dataset['train'].to_pandas()

    print(f"Data loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
    print(df.head())

    print("Fitting vectorizer...")
    start_time = time.time()

    # Same column and slice as the training script, so the rebuilt vocabulary
    # maps words to the same column indices the model saw during training.
    X_text = df['text'][:data_size]
    count_vect.fit(X_text)

    print(f"Vectorizer fit in {round((time.time() - start_time) * 1000, 3)} ms.")

# Fail early with a readable message instead of a shape error inside predict().
expected_features = getattr(mlp, "n_features_in_", None)
actual_features = len(count_vect.vocabulary_)
if expected_features is not None and expected_features != actual_features:
    raise ValueError(
        f"Vectorizer produces {actual_features} features but the model "
        f"expects {expected_features}; model.pkl and vectorizer.pkl do not match."
    )

print("Predicting...")
start_time = time.time()

sentences = ["It is about time for all illegals to go back to their country of origin and keep our freeway open and prevent heavy traffic.", "OMG! The EGO\'s of these young, young, inexperienced women display are remarkable! So self absorbed - so, so proud of the mistakes their constituents all made in giving them a bit of power. They\'ll be quite surprised to find that the vast majority of Americans think that these women represent the very WORST of \"modern\" America. Egos out of control !!", "Instagram refugees lmao, let's build a wall instead"]
# transform (not fit_transform): the vocabulary must stay exactly as fitted.
vectorized_sentences = count_vect.transform(sentences)

predictions = mlp.predict(vectorized_sentences)

print(f"Predictions made in {round((time.time() - start_time) * 1000, 3)} ms.")

# Map each input sentence to its predicted target vector.
values = dict(zip(sentences, predictions))
print(values)
train.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train an MLP regressor on the Measuring Hate Speech dataset.

Fits a bag-of-words ``CountVectorizer`` on the text column, trains an
``MLPRegressor`` on the annotation columns, and writes the fitted
vectorizer (``vectorizer.pkl``) and model (``model.pkl``) to the working
directory.
"""
import time

print("Loading libraries...")
start_time = time.time()

# Imported after the timer starts on purpose, so the load time can be reported.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import datasets
import pickle

print(f"Libraries loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
print("Loading vectorizer...")
start_time = time.time()

count_vect = CountVectorizer()

print(f"Vectorizer loaded in {round((time.time() - start_time) * 1000, 3)} ms.")

print("Setting configuration...")
start_time = time.time()

# Set configuration
sklearn.set_config(working_memory=4096)
# Number of leading dataset rows used for training.
data_size = 100000

print(f"Configuration set in {round((time.time() - start_time) * 1000, 3)} ms.")
print("Loading data...")
start_time = time.time()

# Load data
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
df = dataset['train'].to_pandas()

print(f"Data loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
print(df.head())

print("Preprocessing data...")
start_time = time.time()

# Extract text and labels
X_text = df['text'][:data_size]  # Assuming 'text' is the column containing the text data
y_columns = ['hate_speech_score', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech']
y = df[y_columns][:data_size]
y = y.fillna(0)

# Convert text to vectors; this is the one and only place the vocabulary is learned.
X = count_vect.fit_transform(X_text)

print(f"Data preprocessed in {round((time.time() - start_time) * 1000, 3)} ms.")

print("Saving vectorizer...")
start_time = time.time()

# Save the vectorizer only AFTER fitting: an unfitted vectorizer has no
# vocabulary, so a pickle taken earlier cannot reproduce the training features.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vect, vectorizer_file)

print(f"Vectorizer saved in {round((time.time() - start_time) * 1000, 3)} ms.")

print("Splitting data...")
start_time = time.time()

# Hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

print(f"Data split in {round((time.time() - start_time) * 1000, 3)} ms.")
print("Training model...")
start_time = time.time()

# Create MLPRegressor model
mlp = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64, 32, 16),
    activation='relu',
    max_iter=100,
    alpha=0.0001,
    learning_rate_init=0.003,
    solver='adam',
    verbose=True,
    tol=0.000000000001,
    early_stopping=False,
    n_iter_no_change=5000,
)
mlp.fit(X_train, y_train)

print(f"Model trained in {round((time.time() - start_time), 3)} s.")
print("Evaluating model...")

# Predict and score
predictions = mlp.predict(X_test)
print("Mean squared error: ", mean_squared_error(y_test, predictions))

# Save the model to disk before opening the plot window: plt.show() blocks
# until the window is closed, and the trained model should not be lost if the
# process is interrupted while the plot is up.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(mlp, model_file)

# Test the model for fun :)
sentences = ["Fuck you you stupid nigger", "You're a piece of shit", "Awesome!", "Oh my god, I never realized that!"]
# transform, NOT fit_transform: refitting here would replace the training
# vocabulary with one built from these four sentences and change the feature
# count the model expects.
vectorized_sentences = count_vect.transform(sentences)

predictions = mlp.predict(vectorized_sentences)
# Map each raw sentence (not its sparse vector) to its predicted targets.
values = dict(zip(sentences, predictions))
print(values)

# Plot the loss curve
plt.plot(mlp.loss_curve_)
plt.title("Loss curve")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()

print("Done!")
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fef08e75c9f3f95fffc3adcac5b216d5cba29accadff7f1bbd02b71e98cf0fae
3
+ size 400