import re
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, TFBertForSequenceClassification

nltk.download('punkt')
nltk.download('stopwords')
# Load the labeled tweet dataset and normalize the column names.
df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")
df.rename(columns={
    'Sentiment': 'sentiment',
    'Pasangan Calon': 'calon',
    'Text Tweet': 'text'
}, inplace=True)
df.dropna(inplace=True)
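# Quick sanity-check sketch: inspect the class balance before splitting;
# a strong skew would argue for a stratified split later on.
print(df['sentiment'].value_counts())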
# Preprocessing
def clean_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
    text = re.sub(r"@\S+", "", text)                   # remove mentions
    text = re.sub(r"#\S+", "", text)                   # remove hashtags
    text = re.sub(r"\d+", "", text)                    # remove numbers
    text = re.sub(r"[^\w\s]", "", text)                # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)           # collapse repeated characters
    text = text.strip()                                # strip leading/trailing whitespace
    text = text.lower()                                # lowercase
    return text
# Combine NLTK's Indonesian stopword list with the domain-specific list
# shipped alongside the dataset.
stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
stopword_pilkada.columns = ['stopword']
stop_words = set(stopwords.words('indonesian'))
additional_sw = set(stopword_pilkada.stopword.values)
stop_words = stop_words.union(additional_sw)
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)
def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

text_to_process = "sangat gak bagus pak ahok"
processed_text = preprocess_text(text_to_process)
print(processed_text)
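# Optional sketch (an assumption, not part of the recorded run): the encoding
# steps below tokenize the raw text column, so applying the cleaning pipeline
# corpus-wide would change the results and is left commented out.
# df['text'] = df['text'].apply(preprocess_text)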
# 70/15/15 train/validation/test split.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
print("Train Data Size: ", len(df_train))     # 70%
print("Validation Data Size: ", len(df_val))  # 15%
print("Test Data Size: ", len(df_test))       # 15%
PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
vocab = tokenizer.get_vocab()

# Check the token-length distribution to choose a sensible maximum sequence length.
token_lens = []
for txt in df["text"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))
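# Visualization sketch: seaborn is already imported, so a histogram of
# token_lens is one way to confirm that MAX_LEN = 60 covers nearly all tweets.
sns.histplot(token_lens)
plt.xlabel("token count per tweet")
plt.show()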
MAX_LEN = 60

# Map string labels to integers for training.
df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
def encode_sentence(sent):
    return tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LEN,
        return_attention_mask=True,
        return_token_type_ids=True
    )
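# Quick check (sketch, with a made-up Indonesian sentence): every encoding
# should come back padded or truncated to exactly MAX_LEN tokens.
sample = encode_sentence("contoh kalimat")
print(len(sample["input_ids"]), len(sample["attention_mask"]))  # 60 60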
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }, label
def encode_dataset(ds, limit=-1):
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []
    # Enumerate rather than relying on the DataFrame index, which is no
    # longer sequential after train_test_split.
    for i, (_, row) in enumerate(ds.iterrows()):
        if limit > 0 and i >= limit:
            break
        encoded = encode_sentence(row["text"])  # encode once, reuse all three outputs
        input_ids_list.append(encoded["input_ids"])
        attention_mask_list.append(encoded["attention_mask"])
        token_type_ids_list.append(encoded["token_type_ids"])
        label_list.append(row["sentiment"])
    return tf.data.Dataset.from_tensor_slices((
        input_ids_list,
        attention_mask_list,
        token_type_ids_list,
        label_list
    )).map(map_example_to_dict)
EPOCH = 1
BATCH_SIZE = 42
LEARNING_RATE = 1e-5

df_train_shuffled = df_train.sample(frac=1, random_state=42)
train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
val_data = encode_dataset(df_val).batch(BATCH_SIZE)
test_data = encode_dataset(df_test).batch(BATCH_SIZE)
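# Sketch: pull one batch to confirm the expected tensor shapes
# ((BATCH_SIZE, MAX_LEN) per feature) before building the model.
for features, label in train_data.take(1):
    print(features["input_ids"].shape, label.shape)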
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer, loss=loss, metrics=[metric])
# train_data is already batched, so batch_size must not be passed to fit().
history = model.fit(
    train_data,
    epochs=EPOCH,
    validation_data=val_data
)
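# Optional sketch: plot the accuracy curves from the History object (with
# EPOCH = 1 this is a single point; it becomes useful for longer runs).
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.legend()
plt.show()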
# Convert string labels to numeric format for the test dataset.
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})

# Rebuild test_data with the updated DataFrame.
test_data = encode_dataset(df_test).batch(BATCH_SIZE)

# Evaluate the model.
model.evaluate(test_data)
y_pred = model.predict(test_data)
y_actual = np.concatenate([y for x, y in test_data], axis=0)
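# Sketch closing the loop with the metrics imported above (assuming this
# transformers version returns an output object exposing a .logits field
# from model.predict):
y_pred_labels = np.argmax(y_pred.logits, axis=1)
print(classification_report(y_actual, y_pred_labels, target_names=["negative", "positive"]))
sns.heatmap(confusion_matrix(y_actual, y_pred_labels), annot=True, fmt="d")
plt.xlabel("predicted")
plt.ylabel("actual")
plt.show()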
labels = ["negative", "positive"]

def predict(text):
    encoded = encode_sentence(text)  # encode once, reuse all three outputs
    input_ids = tf.expand_dims(encoded["input_ids"], 0)
    attention_mask = tf.expand_dims(encoded["attention_mask"], 0)
    token_type_ids = tf.expand_dims(encoded["token_type_ids"], 0)
    outputs = model({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    })
    return labels[np.argmax(tf.nn.softmax(outputs.logits, axis=1).numpy()[0])]
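# Usage sketch: the sample sentence is the one used earlier in the listing;
# a reasonably trained model should label it negative.
print(predict("sangat gak bagus pak ahok"))  # expected: negative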