import os
import pickle
import time

import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import util

# Set random seed for reproducibility
torch.manual_seed(1)
path = os.getcwd()

# Load datasets: labeled text extracted from 230 GRI CSR reports
# (150 international companies, 2017-2021) and the paragraphs to predict
df_inmemory = pd.read_csv(path + '/raw_data/labeled.csv')
df_paragraph = pd.read_csv(path + '/raw_data/prediction_demo.csv', encoding='latin1')

# Load stored embeddings
with open(path + '/embeddings/embeddings_prediction.pkl', 'rb') as f:
    pred_embeddings = pickle.load(f)['parg_embeddings']
with open(path + '/embeddings/embeddings_labeled.pkl', 'rb') as f:
    embeddings = pickle.load(f)['sent_embeddings']


def get_top_n_similar_reports(new_report, report_embeddings, top_n=20):
    """Return the top_n most similar labeled sentences for one query embedding."""
    search_hits = util.semantic_search(new_report, report_embeddings, top_k=top_n)
    top_report_ids = [hit['corpus_id'] for hit in search_hits[0]]
    similarity_scores = [hit['score'] for hit in search_hits[0]]
    return pd.DataFrame({'top_report_ids': top_report_ids,
                         'cosine_similarity': similarity_scores})


# Cosine similarity search over the first 50,000 paragraph embeddings
test_embeddings = pred_embeddings[:50000]
all_predictions = []
start = time.time()
for test_embedding in test_embeddings:
    result_df = get_top_n_similar_reports(test_embedding.reshape(1, -1), embeddings)
    # Attach the labeled metadata and label columns to each hit
    result = pd.merge(result_df, df_inmemory, left_on='top_report_ids',
                      right_on='index', how='left')
    all_predictions.append(result)

df_all_predictions = pd.concat(all_predictions, keys=range(len(all_predictions)), axis=0)

# K-Nearest Neighbors (KNN) vote counting: for each paragraph, sum the label
# vectors of its k most similar labeled sentences. The label columns start at
# index 8 in the merged frame (2 search columns + 6 metadata columns from
# df_inmemory), which corresponds to df_inmemory.columns[6:].
k = 12
knn_rows = []
for item in range(len(all_predictions)):
    k_similar_reports = df_all_predictions.xs(item).nlargest(k, ['cosine_similarity'])
    vote_counts = k_similar_reports.iloc[:k, 8:].sum()
    knn_rows.append(vote_counts.to_frame().T)
predict = pd.concat(knn_rows, ignore_index=True)

# Sigmoid activation plus thresholding: sigmoid(x) > 0.90 is equivalent to
# x > ~2.2, so a label is assigned when at least 3 of the 12 neighbors carry it
sigmoid = nn.Sigmoid()
data_tensor = torch.tensor(predict.to_numpy().astype(float), dtype=torch.float32)
output = (sigmoid(data_tensor) > 0.90).float()

# Save results
output_df = pd.DataFrame(output.numpy(), columns=predict.columns)
df_results = pd.concat([df_paragraph.iloc[:50000, :].reset_index(), output_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)
print(f"Processing completed in {time.time() - start:.2f} seconds.")
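
# --- Optional: a batched variant of the similarity search, included as a
# sketch rather than as part of the original pipeline. util.semantic_search
# accepts a full 2-D matrix of query embeddings and returns one hit list per
# query, so the per-paragraph Python loop above can be collapsed into a single
# call. The helper name below (get_similar_reports_batched) is illustrative
# only; it is defined but not invoked, so the main pipeline is unchanged.
def get_similar_reports_batched(query_embeddings, report_embeddings, top_n=20):
    """Run one semantic_search call for all queries instead of looping in Python."""
    all_hits = util.semantic_search(query_embeddings, report_embeddings, top_k=top_n)
    return [
        pd.DataFrame({
            'top_report_ids': [hit['corpus_id'] for hit in hits],
            'cosine_similarity': [hit['score'] for hit in hits],
        })
        for hits in all_hits
    ]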