{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "96c060d974614a919a7ae28317c7a653": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b08ae74bfd4349719c403d1eef1156d9", "IPY_MODEL_981026ae1801465f90e4279b2e938650", "IPY_MODEL_76ed2df981174127a4fb457e08f7ae6f" ], "layout": "IPY_MODEL_5a249661e55542ed9b413bef9aaddad1" } }, "b08ae74bfd4349719c403d1eef1156d9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e2ea150161db4c66801d702c982bd0c3", "placeholder": "", "style": "IPY_MODEL_975e5cf5cc47439db4009ec45d5952c1", "value": "Downloading data files: 100%" } }, "981026ae1801465f90e4279b2e938650": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": 
"", "description_tooltip": null, "layout": "IPY_MODEL_154f436cc6444e2cacabe8ad499a95a6", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_35f99919db084efc8130aadceb5531ae", "value": 1 } }, "76ed2df981174127a4fb457e08f7ae6f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_018243a53eb84669a4c38d503261e8f9", "placeholder": "", "style": "IPY_MODEL_4dfdf115d3cd4972a3731e7f1ca70904", "value": " 1/1 [00:00<00:00, 46.60it/s]" } }, "5a249661e55542ed9b413bef9aaddad1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"e2ea150161db4c66801d702c982bd0c3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "975e5cf5cc47439db4009ec45d5952c1": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "154f436cc6444e2cacabe8ad499a95a6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "35f99919db084efc8130aadceb5531ae": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "018243a53eb84669a4c38d503261e8f9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4dfdf115d3cd4972a3731e7f1ca70904": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3108b1c2a7b648c6a5503801936a56a7": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_52f05b987e4c4438a8b642d0e33a202b", "IPY_MODEL_1dc053e5bc4b497fa1b7db2c5576f94c", "IPY_MODEL_e3b146645ea5496d9cf16bdaadef7bb2" ], "layout": "IPY_MODEL_712d32d9410947acb90760dd5702bac0" } }, "52f05b987e4c4438a8b642d0e33a202b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fa054be8ac6543d8942303e1c2a9e244", "placeholder": "", "style": "IPY_MODEL_933ed439461b495eb64204e2f1f72ae2", "value": "Extracting 
data files: 100%" } }, "1dc053e5bc4b497fa1b7db2c5576f94c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_391b297ec3e94213804664d426b10044", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_84827cbf438245caa7d86c48a831ac11", "value": 1 } }, "e3b146645ea5496d9cf16bdaadef7bb2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1afef6c35da447e0a1082210b04c7873", "placeholder": "", "style": "IPY_MODEL_c6d0b35c38794c75b16bb66c073daf49", "value": " 1/1 [00:00<00:00, 32.28it/s]" } }, "712d32d9410947acb90760dd5702bac0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, 
"grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fa054be8ac6543d8942303e1c2a9e244": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "933ed439461b495eb64204e2f1f72ae2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "StyleView", "description_width": "" } }, "391b297ec3e94213804664d426b10044": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "84827cbf438245caa7d86c48a831ac11": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1afef6c35da447e0a1082210b04c7873": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c6d0b35c38794c75b16bb66c073daf49": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ac8e632a9dda4d6dbf7994751cb8f79d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_999df8a630e542ca987ec574e9bd6574", "IPY_MODEL_f009244bc81d4421b22b63d3f05ac8b3", "IPY_MODEL_28921ed4f2e5473084343823fd49d451" ], "layout": "IPY_MODEL_9ff4fa68c75a4496bd82ed287ffd0bee" } }, "999df8a630e542ca987ec574e9bd6574": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", 
"model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9c9a39536b07435880c64ecb51bd1234", "placeholder": "", "style": "IPY_MODEL_0269fee64b4f4cb7a1aa577c947bb239", "value": "Generating train split: " } }, "f009244bc81d4421b22b63d3f05ac8b3": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e5113d3c589f4bb4a1a317ae612bc4ce", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_829c395d934f43539638eddc7ee967d8", "value": 1 } }, "28921ed4f2e5473084343823fd49d451": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_99ea1b0fad9446f5b13ae8a7eab39fb4", "placeholder": "", "style": "IPY_MODEL_4de7f5bc3c8a4c59b1b98a0d01020143", "value": " 3000/0 [00:00<00:00, 11307.46 examples/s]" } }, "9ff4fa68c75a4496bd82ed287ffd0bee": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { 
"_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9c9a39536b07435880c64ecb51bd1234": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": 
null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0269fee64b4f4cb7a1aa577c947bb239": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e5113d3c589f4bb4a1a317ae612bc4ce": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "20px" } }, "829c395d934f43539638eddc7ee967d8": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "99ea1b0fad9446f5b13ae8a7eab39fb4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4de7f5bc3c8a4c59b1b98a0d01020143": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "72edeed1577a4280978f8d7da3a6e872": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3c1528dba4bc41bb9ea0067b6598dac8", "IPY_MODEL_52290dce6801444584f21fe703b47547", "IPY_MODEL_7fea5e318d5c403889c80b83d86c8abe" ], "layout": "IPY_MODEL_1aae22882ab942efb2092344505b04cc" } }, "3c1528dba4bc41bb9ea0067b6598dac8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5be8ca37e58540cc800728d468bdef88", "placeholder": "", "style": "IPY_MODEL_835a2a6310af4c37b8d5e036ae61525e", "value": "Map (num_proc=2): 100%" } }, "52290dce6801444584f21fe703b47547": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_af5313b767db44009c5f96b2825acf31", "max": 100, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a7b5b13b76bd40fca27673b87628a102", "value": 100 } }, "7fea5e318d5c403889c80b83d86c8abe": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": 
null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7e0671da065c4a2aa75f8bd7e8bf688f", "placeholder": "", "style": "IPY_MODEL_5d348e89e0524852a451dd63067fff08", "value": " 100/100 [01:03<00:00, 1.60 examples/s]" } }, "1aae22882ab942efb2092344505b04cc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5be8ca37e58540cc800728d468bdef88": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, 
"flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "835a2a6310af4c37b8d5e036ae61525e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "af5313b767db44009c5f96b2825acf31": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, 
"min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a7b5b13b76bd40fca27673b87628a102": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "7e0671da065c4a2aa75f8bd7e8bf688f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5d348e89e0524852a451dd63067fff08": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "179c516cf716481a98baa336461296ad": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a7a5f724aceb4ae787ccf0d787f101d0", "IPY_MODEL_ee53c5ec1ef9460881772bfba3b5dde4", "IPY_MODEL_9b4b42d5d8504f4ab03dd6419cbe2fb9" ], "layout": "IPY_MODEL_3afc58c0610b44a996fbdf42bfeb1c9e" } }, "a7a5f724aceb4ae787ccf0d787f101d0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7db80f9601494ba48db92afa822d21ca", "placeholder": "", "style": "IPY_MODEL_49c953d09fe5499798f9a200a8f61574", "value": "Map (num_proc=2): 100%" } }, "ee53c5ec1ef9460881772bfba3b5dde4": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", 
"description_tooltip": null, "layout": "IPY_MODEL_6fb7add448204aa4a9baab2bf87e99ae", "max": 100, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_fca07894b1eb48b49e4421e251724886", "value": 100 } }, "9b4b42d5d8504f4ab03dd6419cbe2fb9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_42360ce02ae0463a90cd06aff246284c", "placeholder": "", "style": "IPY_MODEL_72b29dbbdb264a7890674316a53549ce", "value": " 100/100 [00:02<00:00, 45.91 examples/s]" } }, "3afc58c0610b44a996fbdf42bfeb1c9e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"7db80f9601494ba48db92afa822d21ca": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "49c953d09fe5499798f9a200a8f61574": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6fb7add448204aa4a9baab2bf87e99ae": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fca07894b1eb48b49e4421e251724886": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "42360ce02ae0463a90cd06aff246284c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "72b29dbbdb264a7890674316a53549ce": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "ErNT0Hofo3t9" }, "outputs": [], "source": [ "# !pip install -U FlagEmbedding" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!pip install sentence_transformers\n", "!pip install datasets" ], "metadata": { "id": "8qnnLMkKfWQi" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "import transformers\n", "\n", "print(transformers.__version__)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tJkdcGQontiA", "outputId": "0c77e8c7-dbdd-478c-eda5-1521a5ffef33" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4.35.2\n" ] } ] }, { "cell_type": "code", "source": [ "# load datasets" ], "metadata": { "id": "Po8C4K_FpcSZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"csv\", data_files=\"/content/Precily_Text_Similarity.csv\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 113, "referenced_widgets": [ "96c060d974614a919a7ae28317c7a653", "b08ae74bfd4349719c403d1eef1156d9", 
"981026ae1801465f90e4279b2e938650", "76ed2df981174127a4fb457e08f7ae6f", "5a249661e55542ed9b413bef9aaddad1", "e2ea150161db4c66801d702c982bd0c3", "975e5cf5cc47439db4009ec45d5952c1", "154f436cc6444e2cacabe8ad499a95a6", "35f99919db084efc8130aadceb5531ae", "018243a53eb84669a4c38d503261e8f9", "4dfdf115d3cd4972a3731e7f1ca70904", "3108b1c2a7b648c6a5503801936a56a7", "52f05b987e4c4438a8b642d0e33a202b", "1dc053e5bc4b497fa1b7db2c5576f94c", "e3b146645ea5496d9cf16bdaadef7bb2", "712d32d9410947acb90760dd5702bac0", "fa054be8ac6543d8942303e1c2a9e244", "933ed439461b495eb64204e2f1f72ae2", "391b297ec3e94213804664d426b10044", "84827cbf438245caa7d86c48a831ac11", "1afef6c35da447e0a1082210b04c7873", "c6d0b35c38794c75b16bb66c073daf49", "ac8e632a9dda4d6dbf7994751cb8f79d", "999df8a630e542ca987ec574e9bd6574", "f009244bc81d4421b22b63d3f05ac8b3", "28921ed4f2e5473084343823fd49d451", "9ff4fa68c75a4496bd82ed287ffd0bee", "9c9a39536b07435880c64ecb51bd1234", "0269fee64b4f4cb7a1aa577c947bb239", "e5113d3c589f4bb4a1a317ae612bc4ce", "829c395d934f43539638eddc7ee967d8", "99ea1b0fad9446f5b13ae8a7eab39fb4", "4de7f5bc3c8a4c59b1b98a0d01020143" ] }, "id": "OKRbrhLwk3-7", "outputId": "5e2a92a4-dcff-4125-eec2-0ae068ac8ee2" }, "execution_count": 3, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00, ?it/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "96c060d974614a919a7ae28317c7a653" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Extracting data files: 0%| | 0/1 [00:00, ?it/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "3108b1c2a7b648c6a5503801936a56a7" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Generating train split: 0 examples [00:00, ? 
examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "ac8e632a9dda4d6dbf7994751cb8f79d" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "dataset['train'][100]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b_x-1ZxelWOc", "outputId": "64b75457-0169-47e1-9695-e45978beac96" }, "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'text1': 'gardener wins double in glasgow britain s jason gardener enjoyed a double 60m success in glasgow in his first competitive outing since he won 100m relay gold at the athens olympics. gardener cruised home ahead of scot nick smith to win the invitational race at the norwich union international. he then recovered from a poor start in the second race to beat swede daniel persson and italy s luca verdecchia. his times of 6.61 and 6.62 seconds were well short of american maurice greene s 60m world record of 6.39secs from 1998. it s a very hard record to break but i believe i ve trained very well said the world indoor champion who hopes to get closer to the mark this season. it was important to come out and make sure i got maximum points. my last race was the olympic final and there was a lot of expectation. this was just what i needed to sharpen up and get some race fitness. i m very excited about the next couple of months. double olympic champion marked her first appearance on home soil since winning 1500m and 800m gold in athens with a victory. there was a third success for britain when edged out russia s olga fedorova and sweden s jenny kallur to win the women s 60m race in 7.23secs. maduaka was unable to repeat the feat in the 200m finishing down in fourth as took the win for russia. and the 31-year-old also missed out on a podium place in the 4x200m relay as the british quartet came in fourth with russia setting a new world indoor record. 
there was a setback for jade johnson as she suffered a recurrence of her back injury in the long jump. russia won the meeting with a final total of 63 points with britain second on 48 and france one point behind in third. led the way for russia by producing a major shock in the high jump as he beat olympic champion stefan holm into second place to end the swede s 22-event unbeaten record. won the triple jump with a leap of 16.87m with britain s tosin oke fourth in 15.80m. won the men s pole vault competition with a clearance of 5.65m with britain s nick buckfield 51cm adrift of his personal best in third. and won the women s 800m with britain s jenny meadows third. there was yet another russian victory in the women s 400m as finished well clear of britain s catherine murphy. chris lambert had to settle for fourth after fading in the closing stages of the men s 200m race as sweden s held off leslie djhone of france. france s won the men s 400m with brett rund fourth for britain. took victory for sweden in the women s 60m hurdles ahead of russia s irina shevchenko and britain s sarah claxton who set a new personal best. italy grabbed their first victory in the men s 1500m as kicked over the last 200 metres to hold off britain s james thie and france s alexis abraham. a botched changeover in the 4x200m relay cost britain s men the chance to add further points as france claimed victory.',\n", " 'text2': 'greek sprinters suspended by iaaf greek sprinters kostas kenteris and katerina thanou have been suspended after failing to take drugs tests before the athens olympics. athletics ruling body the iaaf said explanations from the pair and their former coach as to why they missed the tests were unacceptable . it added that kenteris and thanou had been provisionally suspended pending the resolution of their cases . they face two-year bans if found guilty by the greek athletics federation. the suspension also covers the athletes controversial coach christos tzekos. 
kenteris the 2000 olympic 200m champion and thanou the women s 100m silver medallist from the same games in sydney also face a criminal hearing in greece over the missed tests. they failed to appear to give samples in chicago and tel aviv shortly before the athens games and again in athens on 12 august the eve of the opening ceremony. greek prosecutors have also charged them with faking a midnight motorcycle crash which led to them spending four days in hospital. some medical staff have been charged with writing false medical reports. wednesday s statement said the greek federation (segas) would convene a disciplinary hearing for the trio to determine whether there had been doping violations. there will be a final right of appeal from the decision of the greek federation to the court of arbitration for sport the iaaf said. tzekos insisted he and the runners had nothing to hide. the iaaf s decision means nothing he said. we ll be presenting all our arguments to segas - we re innocent.'}" ] }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "text1 = dataset['train'][100]['text1']\n", "text2 = dataset['train'][100]['text2']" ], "metadata": { "id": "2ajGk5Jj-DvW" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "# BGE embeddings\n" ], "metadata": { "id": "1PRL4gtPfgPI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# from huggingface_hub import snapshot_download\n", "# snapshot_download(repo_id=\"BAAI/bge-small-en-v1.5\", local_dir='models/BAAI/bge-small-en-v1.5')\n", "from sentence_transformers import SentenceTransformer, util\n", "\n", "bge_model = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu', cache_folder=\".\")\n", "\n" ], "metadata": { "id": "82yO69j9I8ch" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "from sentence_transformers import SentenceTransformer, util\n", "# Two lists of sentences\n", "# sentences_1 = [\"broadband 
challenges tv viewing the number of europeans with broadband has exploded over the past 12 months with the web eating into tv viewing habits research suggests. just over 54 million people are hooked up to the net via broadband up from 34 million a year ago according to market analysts nielsen/netratings. the total number of people online in europe has broken the 100 million mark. the popularity of the net has meant that many are turning away from tv say analysts jupiter research. it found that a quarter of web users said they spent less time watching tv in favour of the net the report by nielsen/netratings found that the number of people with fast internet access had risen by 60% over the past year. the biggest jump was in italy where it rose by 120%. britain was close behind with broadband users almost doubling in a year. the growth has been fuelled by lower prices and a wider choice of always-on fast-net subscription plans. twelve months ago high speed internet users made up just over one third of the audience in europe; now they are more than 50% and we expect this number to keep growing said gabrielle prior nielsen/netratings analyst. as the number of high-speed surfers grows websites will need to adapt update and enhance their content to retain their visitors and encourage new ones. the total number of europeans online rose by 12% to 100 million over the past year the report showed with the biggest rise in france italy britain and germany. the ability to browse web pages at high speed download files such as music or films and play online games is changing what people do in their spare time. a study by analysts jupiter research suggested that broadband was challenging television viewing habits. in homes with broadband 40% said they were spending less time watching tv. the threat to tv was greatest in countries where broadband was on the up in particular the uk france and spain said the report. 
it said tv companies faced a major long-term threat over the next five years with broadband predicted to grow from 19% to 37% of households by 2009. year-on-year we are continuing to see a seismic shift in where when and how europe s population consume media for information and entertainment and this has big implications for tv newspaper and radio said jupiter research analyst olivier beauvillian.,gardener wins double in glasgow britain s jason gardener enjoyed a double 60m success in glasgow in his first competitive outing since he won 100m relay gold at the athens olympics. gardener cruised home ahead of scot nick smith to win the invitational race at the norwich union international. he then recovered from a poor start in the second race to beat swede daniel persson and italy s luca verdecchia. his times of 6.61 and 6.62 seconds were well short of american maurice greene s 60m world record of 6.39secs from 1998. it s a very hard record to break but i believe i ve trained very well said the world indoor champion who hopes to get closer to the mark this season. it was important to come out and make sure i got maximum points. my last race was the olympic final and there was a lot of expectation. this was just what i needed to sharpen up and get some race fitness. i m very excited about the next couple of months. double olympic champion marked her first appearance on home soil since winning 1500m and 800m gold in athens with a victory. there was a third success for britain when edged out russia s olga fedorova and sweden s jenny kallur to win the women s 60m race in 7.23secs. maduaka was unable to repeat the feat in the 200m finishing down in fourth as took the win for russia. and the 31-year-old also missed out on a podium place in the 4x200m relay as the british quartet came in fourth with russia setting a new world indoor record. there was a setback for jade johnson as she suffered a recurrence of her back injury in the long jump. 
russia won the meeting with a final total of 63 points with britain second on 48 and france one point behind in third. led the way for russia by producing a major shock in the high jump as he beat olympic champion stefan holm into second place to end the swede s 22-event unbeaten record. won the triple jump with a leap of 16.87m with britain s tosin oke fourth in 15.80m. won the men s pole vault competition with a clearance of 5.65m with britain s nick buckfield 51cm adrift of his personal best in third. and won the women s 800m with britain s jenny meadows third. there was yet another russian victory in the women s 400m as finished well clear of britain s catherine murphy. chris lambert had to settle for fourth after fading in the closing stages of the men s 200m race as sweden s held off leslie djhone of france. france s won the men s 400m with brett rund fourth for britain. took victory for sweden in the women s 60m hurdles ahead of russia s irina shevchenko and britain s sarah claxton who set a new personal best. italy grabbed their first victory in the men s 1500m as kicked over the last 200 metres to hold off britain s james thie and france s alexis abraham. a botched changeover in the 4x200m relay cost britain s men the chance to add further points as france claimed victory.\"]\n", "# sentences_2 = [\"rap boss arrested over drug find rap mogul marion suge knight has been arrested for violating his parole after he was allegedly found with marijuana. he was arrested in barstow california on saturday following an alleged traffic offence. he is expected to be transferred to a state prison while a decision is made on whether he should be released. mr knight founder of death row records served a 10-month jail term in 2004 for punching a man while on parole for an assault conviction. police said mr knight was stopped on saturday after performing an illegal u-turn and a search of his car allegedly found marijuana. he is also accused of not having insurance. 
a 18-year-old woman in the car was arrested for providing false information and having a fake id card. she was later released. it was his second alleged violation having previously served half of a nine-year sentence for breaking the terms of his parole. mr knight 39 was jailed in october 1996 following his involvement in a fight with a rival gang just hours before rapper tupac shakur was killed in a las vegas drive-by shooting. he was driving shakur s car at the time and was shot in the head. at the time he was on probation for assaulting two musicians. mr knight a former bodyguard set up death row records in the early 1990s with shakur and dr dre among his protegees. but the label has always been dogged by allegations it supports gang culture and fuels the east and west coast rap rivalry.,amnesty chief laments war failure the lack of public outrage about the war on terror is a powerful indictment of the failure of human rights groups amnesty international s chief has said. in a lecture at the london school of economics irene khan said human rights had been flouted in the name of security since 11 september 2001. she said the human rights movement had to use simpler language both to prevent scepticism and spread a moral message. and it had to fight poverty not just focus on political rights for elites. ms khan highlighted detentions without trial including those at the us camp at guantanamo bay in cuba and the abuse of prisoners as evidence of increasing human rights problems. what s a new challenge is the way in which this age-old debate on security and human rights has been translated into the language of war she said. by using the language of war human rights are being sidelined because we know human rights do not apply in times of war. ms khan said such breaches were infectious and were now seen in almost very major country in the world. the human rights movement faces a crisis of faith in the value of human rights she said. 
that was accompanied by a crisis of governance where the united nations system did not seem able to hold countries to account. the amnesty secretary-general said a growing gap between the perceived influence of human rights group and what they could actually achieve was fuelling scepticism. public passivity on the war against terror is the single most powerful indictment on the failures of human rights groups she said. ms khan said the movement had failed to mobilise public outrage about what was happening to the human rights system. there needed to be a drive to use simpler language talking about the basic morality of the issues rather than the complexity of legal processes. such efforts could make the issues more relevant to people across the world she said. the human rights groups also had to recognise there were new groups which had to be tackled in new ways as power dripped away from state governments. al-qaeda for example was not going to be impressed by a traditional amnesty letter writing campaign. more also needed to be done to develop a human rights framework for international business corporations. amnesty international members voted in 2001 to extend the organisation s work from political and civil rights to cover social and economic rights too. ms khan said the human rights movement would make itself irrelevant if it turned away from the suffering caused by economic strife. we would be an elitist bunch working for the elites for those who cannot read the newspaper of their choice rather than those who cannot read she said. 
despite her concerns ms khan dubbed herself a hope-monger saying she was confident the passions of the human rights movement could overcome the new challenges.\"]\n", "\n", "\n", "q_embeddings = bge_model.encode(text1, normalize_embeddings=True)\n", "p_embeddings = bge_model.encode(text2, normalize_embeddings=True)\n", "\n", "scores = q_embeddings @ p_embeddings.T\n", "\n", "print(scores)\n", "# cosine_scores = util.cos_sim(q_embeddings, p_embeddings)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XlgJoTkD-EEz", "outputId": "ab62c60e-814e-4235-c1fb-e667edac0ad8" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.6106738\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "5QJIuG2MgGtw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def similarity_score(model, textA, textB):\n", "    \"\"\"Return the similarity score of two texts under `model`.\n", "\n", "    Both texts are embedded in a single `model.encode` call that requests\n", "    normalized embeddings, and the dot product of the two resulting vectors\n", "    is returned. For unit-length vectors that dot product is the cosine\n", "    similarity.\n", "\n", "    Args:\n", "        model: object exposing `encode(texts, normalize_embeddings=...)`.\n", "        textA: first text to compare.\n", "        textB: second text to compare.\n", "    \"\"\"\n", "    emb_a, emb_b = model.encode([textA, textB], normalize_embeddings=True)\n", "    # For 1-D embedding vectors `.T` is a no-op, so this is a plain dot product.\n", "    return emb_a @ emb_b.T" ], "metadata": { "id": "I1dWDz3RgcQM" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "similarity_score(bge_model, text1, text2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YXLz8mMrhs76", "outputId": "e4167f7b-b24a-4c9c-a10c-f4ec3d0da6cf" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.61067384" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "XfjLm9M13Wns" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "# all-MiniLM" ], "metadata": { "id": "zFswlUR5z5sq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sentence_transformers import SentenceTransformer, util\n", "\n", "mini_lmmodel = SentenceTransformer('all-MiniLM-L6-v2')" ], "metadata": { "id": "dCdFYtYXrN9x" }, "execution_count": 14, "outputs": [] 
}, { "cell_type": "code", "source": [ "\n", "\n", "# Two lists of sentences\n", "# sentences_1 = [\"broadband challenges tv viewing the number of europeans with broadband has exploded over the past 12 months with the web eating into tv viewing habits research suggests. just over 54 million people are hooked up to the net via broadband up from 34 million a year ago according to market analysts nielsen/netratings. the total number of people online in europe has broken the 100 million mark. the popularity of the net has meant that many are turning away from tv say analysts jupiter research. it found that a quarter of web users said they spent less time watching tv in favour of the net the report by nielsen/netratings found that the number of people with fast internet access had risen by 60% over the past year. the biggest jump was in italy where it rose by 120%. britain was close behind with broadband users almost doubling in a year. the growth has been fuelled by lower prices and a wider choice of always-on fast-net subscription plans. twelve months ago high speed internet users made up just over one third of the audience in europe; now they are more than 50% and we expect this number to keep growing said gabrielle prior nielsen/netratings analyst. as the number of high-speed surfers grows websites will need to adapt update and enhance their content to retain their visitors and encourage new ones. the total number of europeans online rose by 12% to 100 million over the past year the report showed with the biggest rise in france italy britain and germany. the ability to browse web pages at high speed download files such as music or films and play online games is changing what people do in their spare time. a study by analysts jupiter research suggested that broadband was challenging television viewing habits. in homes with broadband 40% said they were spending less time watching tv. 
the threat to tv was greatest in countries where broadband was on the up in particular the uk france and spain said the report. it said tv companies faced a major long-term threat over the next five years with broadband predicted to grow from 19% to 37% of households by 2009. year-on-year we are continuing to see a seismic shift in where when and how europe s population consume media for information and entertainment and this has big implications for tv newspaper and radio said jupiter research analyst olivier beauvillian.,gardener wins double in glasgow britain s jason gardener enjoyed a double 60m success in glasgow in his first competitive outing since he won 100m relay gold at the athens olympics. gardener cruised home ahead of scot nick smith to win the invitational race at the norwich union international. he then recovered from a poor start in the second race to beat swede daniel persson and italy s luca verdecchia. his times of 6.61 and 6.62 seconds were well short of american maurice greene s 60m world record of 6.39secs from 1998. it s a very hard record to break but i believe i ve trained very well said the world indoor champion who hopes to get closer to the mark this season. it was important to come out and make sure i got maximum points. my last race was the olympic final and there was a lot of expectation. this was just what i needed to sharpen up and get some race fitness. i m very excited about the next couple of months. double olympic champion marked her first appearance on home soil since winning 1500m and 800m gold in athens with a victory. there was a third success for britain when edged out russia s olga fedorova and sweden s jenny kallur to win the women s 60m race in 7.23secs. maduaka was unable to repeat the feat in the 200m finishing down in fourth as took the win for russia. and the 31-year-old also missed out on a podium place in the 4x200m relay as the british quartet came in fourth with russia setting a new world indoor record. 
there was a setback for jade johnson as she suffered a recurrence of her back injury in the long jump. russia won the meeting with a final total of 63 points with britain second on 48 and france one point behind in third. led the way for russia by producing a major shock in the high jump as he beat olympic champion stefan holm into second place to end the swede s 22-event unbeaten record. won the triple jump with a leap of 16.87m with britain s tosin oke fourth in 15.80m. won the men s pole vault competition with a clearance of 5.65m with britain s nick buckfield 51cm adrift of his personal best in third. and won the women s 800m with britain s jenny meadows third. there was yet another russian victory in the women s 400m as finished well clear of britain s catherine murphy. chris lambert had to settle for fourth after fading in the closing stages of the men s 200m race as sweden s held off leslie djhone of france. france s won the men s 400m with brett rund fourth for britain. took victory for sweden in the women s 60m hurdles ahead of russia s irina shevchenko and britain s sarah claxton who set a new personal best. italy grabbed their first victory in the men s 1500m as kicked over the last 200 metres to hold off britain s james thie and france s alexis abraham. a botched changeover in the 4x200m relay cost britain s men the chance to add further points as france claimed victory.\"]\n", "# sentences_2 = [\"rap boss arrested over drug find rap mogul marion suge knight has been arrested for violating his parole after he was allegedly found with marijuana. he was arrested in barstow california on saturday following an alleged traffic offence. he is expected to be transferred to a state prison while a decision is made on whether he should be released. mr knight founder of death row records served a 10-month jail term in 2004 for punching a man while on parole for an assault conviction. 
police said mr knight was stopped on saturday after performing an illegal u-turn and a search of his car allegedly found marijuana. he is also accused of not having insurance. a 18-year-old woman in the car was arrested for providing false information and having a fake id card. she was later released. it was his second alleged violation having previously served half of a nine-year sentence for breaking the terms of his parole. mr knight 39 was jailed in october 1996 following his involvement in a fight with a rival gang just hours before rapper tupac shakur was killed in a las vegas drive-by shooting. he was driving shakur s car at the time and was shot in the head. at the time he was on probation for assaulting two musicians. mr knight a former bodyguard set up death row records in the early 1990s with shakur and dr dre among his protegees. but the label has always been dogged by allegations it supports gang culture and fuels the east and west coast rap rivalry.,amnesty chief laments war failure the lack of public outrage about the war on terror is a powerful indictment of the failure of human rights groups amnesty international s chief has said. in a lecture at the london school of economics irene khan said human rights had been flouted in the name of security since 11 september 2001. she said the human rights movement had to use simpler language both to prevent scepticism and spread a moral message. and it had to fight poverty not just focus on political rights for elites. ms khan highlighted detentions without trial including those at the us camp at guantanamo bay in cuba and the abuse of prisoners as evidence of increasing human rights problems. what s a new challenge is the way in which this age-old debate on security and human rights has been translated into the language of war she said. by using the language of war human rights are being sidelined because we know human rights do not apply in times of war. 
ms khan said such breaches were infectious and were now seen in almost very major country in the world. the human rights movement faces a crisis of faith in the value of human rights she said. that was accompanied by a crisis of governance where the united nations system did not seem able to hold countries to account. the amnesty secretary-general said a growing gap between the perceived influence of human rights group and what they could actually achieve was fuelling scepticism. public passivity on the war against terror is the single most powerful indictment on the failures of human rights groups she said. ms khan said the movement had failed to mobilise public outrage about what was happening to the human rights system. there needed to be a drive to use simpler language talking about the basic morality of the issues rather than the complexity of legal processes. such efforts could make the issues more relevant to people across the world she said. the human rights groups also had to recognise there were new groups which had to be tackled in new ways as power dripped away from state governments. al-qaeda for example was not going to be impressed by a traditional amnesty letter writing campaign. more also needed to be done to develop a human rights framework for international business corporations. amnesty international members voted in 2001 to extend the organisation s work from political and civil rights to cover social and economic rights too. ms khan said the human rights movement would make itself irrelevant if it turned away from the suffering caused by economic strife. we would be an elitist bunch working for the elites for those who cannot read the newspaper of their choice rather than those who cannot read she said. 
despite her concerns ms khan dubbed herself a hope-monger saying she was confident the passions of the human rights movement could overcome the new challenges.\"]\n", "\n", "\n", "# Compute an embedding for each text\n", "embeddings1 = mini_lmmodel.encode(text1, convert_to_tensor=True)\n", "embeddings2 = mini_lmmodel.encode(text2, convert_to_tensor=True)\n", "\n", "# Compute cosine similarities\n", "cosine_scores = util.cos_sim(embeddings1, embeddings2)\n", "\n", "# Print the similarity matrix\n", "print(cosine_scores)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nWHGRzvD0BVY", "outputId": "cca27611-43be-4d69-d29b-6aa3262fb630" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "tensor([[0.4019]], device='cuda:0')\n" ] } ] }, { "cell_type": "code", "source": [ "def similarity_score(model, textA, textB):\n", "    \"\"\"Return the cosine similarity between the embeddings of two texts.\n", "\n", "    Args:\n", "        model: Object exposing `encode(texts, normalize_embeddings=True)` that\n", "            returns one embedding vector per input text.\n", "        textA: First text.\n", "        textB: Second text.\n", "\n", "    Returns:\n", "        The dot product of the two embeddings; with normalized embeddings\n", "        this is their cosine similarity.\n", "    \"\"\"\n", "    embedding_a, embedding_b = model.encode(\n", "        [textA, textB],\n", "        normalize_embeddings=True\n", "    )\n", "    return embedding_a @ embedding_b" ], "metadata": { "id": "Goxit0uRoNGZ" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "similarity_score(mini_lmmodel, text1, text2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b3Seh0cgoM1c", "outputId": "aa167a51-b6b2-4eb0-a315-a7124d814b8f" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.4019029" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "rMK_Ihj9r10q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# using tfidf vectorizer" ], "metadata": { "id": "MWbnSGF50I5n" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "import string\n", "\n", "import nltk\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# nltk.word_tokenize needs 'punkt' on older NLTK releases and 'punkt_tab' on newer ones\n", "nltk.download('punkt')\n", "nltk.download('punkt_tab')\n", "\n", "\n", "stemmer = nltk.stem.porter.PorterStemmer()\n", 
"# Translation table that deletes every ASCII punctuation character\n", "remove_punctuation_map = str.maketrans('', '', string.punctuation)\n", "\n", "def stem_tokens(tokens):\n", "    \"\"\"Return the stem of every token, preserving order.\"\"\"\n", "    return [stemmer.stem(item) for item in tokens]\n", "\n", "def normalize(text):\n", "    \"\"\"Lowercase `text`, strip ASCII punctuation, then tokenize and stem it.\"\"\"\n", "    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))\n", "\n", "vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')\n", "\n", "def cosine_sim(text1, text2):\n", "    \"\"\"Return the TF-IDF cosine similarity of two texts.\n", "\n", "    The shared vectorizer is re-fitted on just this pair, so the IDF weights\n", "    come from a two-document corpus. TfidfVectorizer L2-normalizes each row\n", "    by default, so the product of the two rows is their cosine similarity.\n", "    \"\"\"\n", "    tfidf = vectorizer.fit_transform([text1, text2])\n", "    # `@` and `.toarray()` work on both SciPy sparse matrices and sparse arrays;\n", "    # `.A` is unavailable on sparse arrays and `*` is element-wise there.\n", "    return (tfidf @ tfidf.T).toarray()[0, 1]\n", "\n", "# print(cosine_sim('a little bird', 'a little bird chirps'))\n", "# print(cosine_sim('a little bird', 'a big dog barks'))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yQ0kDsT01BIt", "outputId": "b8ff5a07-ca1d-4983-b288-cb1054bc4907" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n" ] } ] }, { "cell_type": "code", "source": [ "print(cosine_sim(text1, text2))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2LUjIWt6sBCd", "outputId": "32ef70a4-2964-48c5-9189-a1602bec6253" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.10445100436925958\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "4opltShysBOE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# using gensim word2vec model" ], "metadata": { "id": "BFXUUBqxoWYe" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "from gensim.models import Word2Vec\n", "from gensim.utils import simple_preprocess\n", "import numpy as np\n", "\n", "def calculate_similarity(sentence1, sentence2, seed=42):\n", "    \"\"\"Return the cosine similarity of two texts from averaged Word2Vec vectors.\n", "\n", "    A throwaway Word2Vec model is trained on just these two token lists; each\n", "    text is represented by the mean of its token vectors and the cosine of the\n", "    two means is returned.\n", "\n", "    NOTE(review): a model trained on only two documents carries little\n", "    semantic signal, so treat the score as indicative; a pretrained\n", "    embedding model is the usual choice.\n", "\n", "    Args:\n", "        sentence1: First text.\n", "        sentence2: Second text.\n", "        seed: Seed passed to Word2Vec. Combined with workers=1 it makes the\n", "            score repeatable (across interpreter restarts gensim additionally\n", "            requires a fixed PYTHONHASHSEED).\n", "\n", "    Returns:\n", "        The cosine similarity, or 0.0 when either text yields no tokens or an\n", "        averaged vector has zero norm.\n", "    \"\"\"\n", "    tokens1 = simple_preprocess(sentence1)\n", "    tokens2 = simple_preprocess(sentence2)\n", "\n", "    # With no tokens there is nothing to train on or to average.\n", "    if not tokens1 or not tokens2:\n", "        return np.float32(0.0)\n", "\n", "    model = Word2Vec(\n", "        [tokens1, tokens2],\n", "        vector_size=100,\n", "        window=5,\n", "        min_count=1,\n", "        sg=0,\n", "        seed=seed,\n", "        workers=1,\n", "    )\n", "\n", "    # Represent each text by the mean of its token vectors\n", "    vector1 = np.mean([model.wv[token] for token in tokens1], axis=0)\n", "    vector2 = np.mean([model.wv[token] for token in tokens2], axis=0)\n", "\n", "    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)\n", "    if norm_product == 0:\n", "        return np.float32(0.0)\n", "    return np.dot(vector1, vector2) / norm_product\n", "\n" ], "metadata": { "id": "IM3zMLoV1aG8" }, "execution_count": 22, "outputs": [] }, { "cell_type": "code", "source": [ "# Named w2v_score so the similarity_score() function is not overwritten by a float\n", "w2v_score = calculate_similarity(text1, text2)\n", "print(\"Similarity score:\", w2v_score)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_NI82qHaz55l", "outputId": "eea78aa2-83fc-40f2-e9eb-4ac8d348dca4" }, "execution_count": 39, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Similarity score: 0.7974768\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "XxEsgqtVsqo_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "mqCm8ZAssq4k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "2bmxYME6srKL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# process data for BGE and w2v embeddings" ], "metadata": { "id": "GNdqPTE4srYg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "k8a9yQpCvgQ1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def similarity_score(model, textA, textB):\n", "    \"\"\"Return the cosine similarity between the embeddings of two texts.\n", "\n", "    `model.encode` is called with normalize_embeddings=True, so the dot\n", "    product of the two returned vectors is their cosine similarity.\n", "    \"\"\"\n", "    embedding_a, embedding_b = model.encode(\n", "        [textA, textB],\n", "        normalize_embeddings=True\n", "    )\n", "    return embedding_a @ embedding_b" ],
"metadata": { "id": "JEirwptTtmnT" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "def get_similar_score_w2v(data_set):\n", "    \"\"\"Score one dataset row with the Word2Vec similarity.\n", "\n", "    Args:\n", "        data_set: A single example exposing the 'text1' and 'text2' columns.\n", "\n", "    Returns:\n", "        A dict holding the value for the new 'w2v_similar_score' column.\n", "    \"\"\"\n", "    text1 = data_set['text1']\n", "    # Read text2 from the row: it used to be left unassigned here, so every row\n", "    # was silently compared against the notebook-level `text2` variable.\n", "    text2 = data_set['text2']\n", "    score = calculate_similarity(text1, text2)\n", "    return {\n", "        'w2v_similar_score': score\n", "    }" ], "metadata": { "id": "A2xHJHpeyQCt" }, "execution_count": 40, "outputs": [] }, { "cell_type": "code", "source": [ "def get_similar_score_bge(data_set):\n", "    \"\"\"Score one dataset row with the BGE embedding similarity.\n", "\n", "    Args:\n", "        data_set: A single example exposing the 'text1' and 'text2' columns.\n", "\n", "    Returns:\n", "        A dict holding the value for the new 'bge_similar_score' column.\n", "    \"\"\"\n", "    text1 = data_set['text1']\n", "    # Read text2 from the row: it used to be left unassigned here, so every row\n", "    # was silently compared against the notebook-level `text2` variable.\n", "    text2 = data_set['text2']\n", "    score = similarity_score(bge_model, text1, text2)\n", "    return {\n", "        'bge_similar_score': score\n", "    }" ], "metadata": { "id": "VeMOB82DtO4X" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "split_dataset = load_dataset(\"csv\", data_files=\"/content/Precily_Text_Similarity.csv\", split='train[:100]')" ], "metadata": { "id": "4xVAmvhmvjZI" }, "execution_count": 32, "outputs": [] }, { "cell_type": "code", "source": [ "score_dataset = split_dataset.map(get_similar_score_bge, num_proc=2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "72edeed1577a4280978f8d7da3a6e872", "3c1528dba4bc41bb9ea0067b6598dac8", "52290dce6801444584f21fe703b47547", "7fea5e318d5c403889c80b83d86c8abe", "1aae22882ab942efb2092344505b04cc", "5be8ca37e58540cc800728d468bdef88", "835a2a6310af4c37b8d5e036ae61525e", "af5313b767db44009c5f96b2825acf31", "a7b5b13b76bd40fca27673b87628a102", "7e0671da065c4a2aa75f8bd7e8bf688f", "5d348e89e0524852a451dd63067fff08" ] }, "id": "aq9BvUP2tKq0", "outputId": "4bc0115c-b8e8-4647-eae5-37f321762857" }, "execution_count": 33, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Map (num_proc=2): 0%| | 0/100 [00:00, ? 
examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "72edeed1577a4280978f8d7da3a6e872" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "final_score_dataset = score_dataset.map(get_similar_score_w2v, num_proc=2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "179c516cf716481a98baa336461296ad", "a7a5f724aceb4ae787ccf0d787f101d0", "ee53c5ec1ef9460881772bfba3b5dde4", "9b4b42d5d8504f4ab03dd6419cbe2fb9", "3afc58c0610b44a996fbdf42bfeb1c9e", "7db80f9601494ba48db92afa822d21ca", "49c953d09fe5499798f9a200a8f61574", "6fb7add448204aa4a9baab2bf87e99ae", "fca07894b1eb48b49e4421e251724886", "42360ce02ae0463a90cd06aff246284c", "72b29dbbdb264a7890674316a53549ce" ] }, "id": "xxAWm50NymvD", "outputId": "031c922c-f786-4fd9-9e0a-88dbee428307" }, "execution_count": 41, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Map (num_proc=2): 0%| | 0/100 [00:00, ? examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "179c516cf716481a98baa336461296ad" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "final_score_dataset.to_pandas()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "BEA32DrxuHNY", "outputId": "3de198ff-74e0-4940-e512-1e1eb878cbc7" }, "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " text1 \\\n", "0 broadband challenges tv viewing the number of ... \n", "1 rap boss arrested over drug find rap mogul mar... \n", "2 player burn-out worries robinson england coach... \n", "3 hearts of oak 3-2 cotonsport hearts of oak set... \n", "4 sir paul rocks super bowl crowds sir paul mcca... \n", ".. ... \n", "95 quiksilver moves for rossignol shares of skis ... \n", "96 electrolux to export europe jobs electrolux sa... \n", "97 clarke to press on with id cards new home secr... 
\n", "98 moody joins up with england lewis moody has fl... \n", "99 controversial film tops festival a controversi... \n", "\n", " text2 bge_similar_score \\\n", "0 gardener wins double in glasgow britain s jaso... 0.454750 \n", "1 amnesty chief laments war failure the lack of ... 0.452746 \n", "2 hanks greeted at wintry premiere hollywood sta... 0.579094 \n", "3 redford s vision of sundance despite sporting ... 0.498436 \n", "4 mauresmo opens with victory in la amelie maure... 0.439796 \n", ".. ... ... \n", "95 isinbayeva claims new world best pole vaulter ... 0.549550 \n", "96 director nair s vanity project indian film dir... 0.427827 \n", "97 gb quartet get cross country call four british... 0.403946 \n", "98 my memories of marley... to mark the 60th ann... 0.536789 \n", "99 strachan turns down pompey former southampton ... 0.470241 \n", "\n", " w2v_similar_score \n", "0 0.762984 \n", "1 0.564779 \n", "2 0.729674 \n", "3 0.803849 \n", "4 0.706064 \n", ".. ... \n", "95 0.649987 \n", "96 0.637123 \n", "97 0.810142 \n", "98 0.746907 \n", "99 0.702861 \n", "\n", "[100 rows x 4 columns]" ], "text/html": [ "\n", "
\n", " | text1 | \n", "text2 | \n", "bge_similar_score | \n", "w2v_similar_score | \n", "
---|---|---|---|---|
0 | \n", "broadband challenges tv viewing the number of ... | \n", "gardener wins double in glasgow britain s jaso... | \n", "0.454750 | \n", "0.762984 | \n", "
1 | \n", "rap boss arrested over drug find rap mogul mar... | \n", "amnesty chief laments war failure the lack of ... | \n", "0.452746 | \n", "0.564779 | \n", "
2 | \n", "player burn-out worries robinson england coach... | \n", "hanks greeted at wintry premiere hollywood sta... | \n", "0.579094 | \n", "0.729674 | \n", "
3 | \n", "hearts of oak 3-2 cotonsport hearts of oak set... | \n", "redford s vision of sundance despite sporting ... | \n", "0.498436 | \n", "0.803849 | \n", "
4 | \n", "sir paul rocks super bowl crowds sir paul mcca... | \n", "mauresmo opens with victory in la amelie maure... | \n", "0.439796 | \n", "0.706064 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
95 | \n", "quiksilver moves for rossignol shares of skis ... | \n", "isinbayeva claims new world best pole vaulter ... | \n", "0.549550 | \n", "0.649987 | \n", "
96 | \n", "electrolux to export europe jobs electrolux sa... | \n", "director nair s vanity project indian film dir... | \n", "0.427827 | \n", "0.637123 | \n", "
97 | \n", "clarke to press on with id cards new home secr... | \n", "gb quartet get cross country call four british... | \n", "0.403946 | \n", "0.810142 | \n", "
98 | \n", "moody joins up with england lewis moody has fl... | \n", "my memories of marley... to mark the 60th ann... | \n", "0.536789 | \n", "0.746907 | \n", "
99 | \n", "controversial film tops festival a controversi... | \n", "strachan turns down pompey former southampton ... | \n", "0.470241 | \n", "0.702861 | \n", "
100 rows × 4 columns
\n", "