RobertoBarrosoLuque committed on
Commit
4818cbb
·
1 Parent(s): 360f329

Data exploration ready

Browse files
.pre-commit-config.yaml CHANGED
@@ -40,9 +40,3 @@ repos:
40
  hooks:
41
  - id: black
42
  args: ["--target-version", "py311"]
43
-
44
- - repo: https://github.com/Yelp/detect-secrets
45
- rev: v1.5.0
46
- hooks:
47
- - id: detect-secrets
48
- exclude: ^(graphql-mock/pnpm-lock\.yaml|.*\.ipynb)$
 
40
  hooks:
41
  - id: black
42
  args: ["--target-version", "py311"]
 
 
 
 
 
 
notebooks/eda-and-fine-tuning.ipynb ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from datasets import load_dataset\n",
11
+ "from sklearn.model_selection import train_test_split\n",
12
+ "from PIL import Image\n",
13
+ "import io\n",
14
+ "import matplotlib.pyplot as plt"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "id": "1",
20
+ "metadata": {},
21
+ "source": [
22
+ "## Understand the data and split into train and test\n",
23
+ "1. Shape of dataset\n",
24
+ "2. Distribution / balance of categories\n",
25
+ "3. Train-test split"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "id": "2",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "ds =load_dataset(\"ceyda/fashion-products-small\")\n",
36
+ "df = ds['train'].to_pandas()\n",
37
+ "print(f\"Shape of dataset: {df.shape}\")"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "3",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "### For expediency we will randomly sample only 10,000 total rows\n",
48
+ "sample_size = 10000\n",
49
+ "df = df.sample(n=sample_size, random_state=42)\n",
50
+ "print(f\"Shape of dataset after sampling: {df.shape}\")"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "4",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "def get_category_distribution_by_percent(df, col):\n",
61
+ " count_df = df.groupby(col)[\"id\"].count().reset_index(name=\"count\")\n",
62
+ " _denominator = df.shape[0]\n",
63
+ " count_df.loc[:, \"percent\"] = (count_df[\"count\"] / _denominator) * 100\n",
64
+ " return count_df.sort_values(by=\"percent\", ascending=False)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "id": "5",
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "m_cat = get_category_distribution_by_percent(df, \"masterCategory\")\n",
75
+ "m_cat"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "6",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "get_category_distribution_by_percent(df, \"subCategory\")"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "7",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "get_category_distribution_by_percent(df, \"gender\")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "markdown",
100
+ "id": "8",
101
+ "metadata": {},
102
+ "source": [
103
+ "As seen above, the dataset is imbalanced, especially across masterCategory. Let's filter out any masterCategory that accounts for less than 2% of the dataset."
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "9",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "cat_less_than_2_percent = m_cat.loc[m_cat.loc[:, \"percent\"] < 2, \"masterCategory\"].values\n",
114
+ "print(f\"Starting with {df.shape}\")\n",
115
+ "df = df.loc[~df.loc[:, \"masterCategory\"].isin(cat_less_than_2_percent)]\n",
116
+ "print(f\"Finished with {df.shape}\")"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": null,
122
+ "id": "10",
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": [
126
+ "df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)\n",
127
+ "print(f\"Train shape: {df_train.shape}\")\n",
128
+ "print(f\"Test shape: {df_test.shape}\")"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "11",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "## Save datasets in /data folder\n",
139
+ "df_train.to_csv(\"../data/train.csv\", index=False)\n",
140
+ "df_test.to_csv(\"../data/test.csv\", index=False)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "id": "12",
146
+ "metadata": {},
147
+ "source": [
148
+ "## Fine tuning"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "13",
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "import base64\n",
159
+ "from io import BytesIO\n",
160
+ "\n",
161
+ "def pil_to_base64(pil_image):\n",
162
+ " \"\"\"Convert PIL Image to base64 string\"\"\"\n",
163
+ " buffered = BytesIO()\n",
164
+ " pil_image.save(buffered, format=\"PNG\")\n",
165
+ " img_str = base64.b64encode(buffered.getvalue()).decode()\n",
166
+ " return f\"data:image/png;base64,{img_str}\""
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "id": "14",
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "img_bytes = df_train['image'][0]['bytes']\n",
177
+ "img = Image.open(io.BytesIO(img_bytes))\n",
178
+ "plt.imshow(img)\n",
179
+ "plt.axis('off')\n",
180
+ "plt.title(ds['train'][0].get('productDisplayName', 'Product'))\n",
181
+ "plt.show()"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "id": "15",
187
+ "metadata": {},
188
+ "source": [
189
+ "Convert the dataset to the Fireworks JSONL format as specified in [the docs](https://fireworks.ai/docs/fine-tuning/fine-tuning-vlm#supervised-fine-tuning-for-vlms-sft)"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "16",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "df_train.columns"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "17",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python 3",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 2
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython2",
227
+ "version": "2.7.6"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
requirements.txt CHANGED
@@ -2,3 +2,8 @@ huggingface_hub==0.34.3
2
  fireworks-ai==0.19.18
3
  gradio==5.42.0
4
  python-dotenv==1.0.0
 
 
 
 
 
 
2
  fireworks-ai==0.19.18
3
  gradio==5.42.0
4
  python-dotenv==1.0.0
5
+ ipython
6
+ scikit-learn
7
+ jupyter
8
+ altair
9
+ matplotlib
src/preprocessing/data_processing.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+
4
+
5
def pil_to_base64(pil_image, image_format="PNG"):
    """Encode an image object as a base64 ``data:`` URI.

    Args:
        pil_image: Object exposing a ``save(fp, format=...)`` method; it is
            asked to write itself into an in-memory buffer.
        image_format: Format name passed to ``save``. Defaults to ``"PNG"``,
            which reproduces the previous hard-coded behaviour.

    Returns:
        A string of the form ``data:image/<format>;base64,<payload>`` where
        ``<format>`` is ``image_format`` lower-cased.
    """
    buffered = BytesIO()
    pil_image.save(buffered, format=image_format)
    encoded = base64.b64encode(buffered.getvalue()).decode("ascii")
    # Derive the MIME subtype from the format that was actually written so the
    # URI label and the payload can never disagree.
    return f"data:image/{image_format.lower()};base64,{encoded}"
11
+
12
+
13
def _sniff_image_mime(header):
    """Return the MIME type implied by an image's leading bytes (JPEG if unrecognized)."""
    if header.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if header.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return "image/webp"
    # Fallback keeps the label this function has always emitted.
    return "image/jpeg"


def image_to_base64(img_bytes):
    """Convert encoded image bytes to a base64 ``data:`` URI.

    Args:
        img_bytes: Either a bytes-like object holding an encoded image, or a
            dict with a ``"bytes"`` key whose value is that object.

    Returns:
        ``data:<mime>;base64,<payload>``. The MIME type is taken from the
        payload's magic bytes (PNG, GIF, WebP); anything else is labelled
        ``image/jpeg``, matching the previous behaviour.

    Raises:
        TypeError: If the payload is not bytes-like.
    """
    if isinstance(img_bytes, dict) and "bytes" in img_bytes:
        img_bytes = img_bytes["bytes"]

    # Only the first 12 bytes are needed to recognise the supported formats.
    mime_type = _sniff_image_mime(bytes(img_bytes[:12]))

    # Encode to base64
    b64_string = base64.b64encode(img_bytes).decode("utf-8")
    return f"data:{mime_type};base64,{b64_string}"
21
+
22
+
23
def create_training_example(row):
    """Create a training example with both classification and description tasks.

    Args:
        row: Mapping indexed by ``"image"``, ``"masterCategory"``,
            ``"gender"`` and ``"subCategory"``. The three category values
            must be strings, since ``.lower()`` is called on each.

    Returns:
        A dict with a single ``"messages"`` key holding a system message, a
        user message (image URL part followed by a text part) and the
        assistant message containing the expected answer.
    """
    # Convert image to a base64 data URI
    img_b64 = image_to_base64(row["image"])

    # Multi-task prompt combining classification and description
    user_prompt = "Analyze this fashion product image and provide: 1) Master category, 2) Gender, 3) Sub-category, and 4) A detailed description."

    master_category = row["masterCategory"]
    gender = row["gender"]
    sub_category = row["subCategory"]

    # Build the target from explicit lines rather than a triple-quoted
    # f-string, so the text does not begin with a stray newline and cannot
    # pick up indentation from the source file.
    assistant_response = "\n".join(
        [
            f"Master Category: {master_category}",
            f"Gender: {gender}",
            f"Sub-category: {sub_category}",
            "",
            f"Description: This is a {gender.lower()} {sub_category.lower()} from the {master_category.lower()} category.",
        ]
    )

    # Format as OpenAI-compatible messages
    return {
        "messages": [
            {
                "role": "system",
                "content": "You are a fashion product analyst. Classify products and generate detailed descriptions based on images.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": img_b64}},
                    {"type": "text", "text": user_prompt},
                ],
            },
            {"role": "assistant", "content": assistant_response},
        ]
    }