brendon-ai committed on
Commit
e7ee502
·
verified ·
1 Parent(s): 3cf5704

Update src/kaggle_loader.py

Browse files
Files changed (1) hide show
  1. src/kaggle_loader.py +240 -0
src/kaggle_loader.py CHANGED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import json
4
+ from typing import List, Optional
5
+ from langchain_core.documents import Document
6
+ from langchain_community.document_loaders import CSVLoader, JSONLoader
7
+ import kaggle
8
+
9
class KaggleDataLoader:
    """Load and process Kaggle datasets for RAG.

    The loader resolves Kaggle API credentials once, at construction time,
    and exposes helpers that download a dataset and turn CSV, JSON or plain
    text files into lists of LangChain ``Document`` objects.
    """

    def __init__(self, kaggle_username: Optional[str] = None, kaggle_key: Optional[str] = None):
        """
        Initialize Kaggle loader.

        Credential precedence (first match wins):
          1. a valid kaggle.json found on disk (this overrides the arguments),
          2. the ``kaggle_username`` / ``kaggle_key`` arguments,
          3. ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` already set in the environment.

        Whatever is resolved is exported to ``os.environ`` so the Kaggle API
        client can pick it up.

        Args:
            kaggle_username: Your Kaggle username (optional if using kaggle.json)
            kaggle_key: Your Kaggle API key (optional if using kaggle.json)
        """
        self.kaggle_username = kaggle_username
        self.kaggle_key = kaggle_key

        # Try to load credentials from kaggle.json first
        self._load_kaggle_credentials()

        # Last resort: credentials that were already exported by the caller's shell.
        if not (self.kaggle_username and self.kaggle_key):
            env_username = os.environ.get('KAGGLE_USERNAME')
            env_key = os.environ.get('KAGGLE_KEY')
            if env_username and env_key:
                self.kaggle_username = env_username
                self.kaggle_key = env_key

        # Set Kaggle credentials (either from kaggle.json or parameters)
        if self.kaggle_username and self.kaggle_key:
            os.environ['KAGGLE_USERNAME'] = self.kaggle_username
            os.environ['KAGGLE_KEY'] = self.kaggle_key
            print("Kaggle credentials loaded successfully")
        else:
            print("Warning: No Kaggle credentials found. Please set up kaggle.json or provide credentials.")

    def _load_kaggle_credentials(self):
        """Load Kaggle credentials from the first valid kaggle.json file.

        Candidate locations are probed in order: ``$KAGGLE_CONFIG_DIR`` (when
        set), ``~/.kaggle``, the home directory and the current working
        directory. On success ``self.kaggle_username`` and ``self.kaggle_key``
        are overwritten with the file's values. Unreadable or malformed files
        are reported and skipped; nothing is raised.
        """
        candidates = []
        config_dir = os.environ.get('KAGGLE_CONFIG_DIR')
        if config_dir:
            candidates.append(os.path.join(config_dir, "kaggle.json"))
        candidates.extend([
            os.path.expanduser("~/.kaggle/kaggle.json"),
            os.path.expanduser("~/kaggle.json"),
            "./kaggle.json",
            os.path.join(os.getcwd(), "kaggle.json"),
        ])

        # Several candidates can point at the same file (e.g. "./kaggle.json"
        # and the cwd-joined path); keep only the first spelling of each.
        seen = set()
        possible_paths = []
        for path in candidates:
            resolved = os.path.abspath(path)
            if resolved not in seen:
                seen.add(resolved)
                possible_paths.append(path)

        for path in possible_paths:
            if not os.path.exists(path):
                continue

            try:
                with open(path, 'r', encoding='utf-8') as f:
                    credentials = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both invalid JSON and undecodable bytes.
                print(f"Error reading kaggle.json from {path}: {e}")
                continue

            # Extract username and key from kaggle.json
            if isinstance(credentials, dict) and 'username' in credentials and 'key' in credentials:
                self.kaggle_username = credentials['username']
                self.kaggle_key = credentials['key']
                print(f"Loaded Kaggle credentials from {path}")
                return

            print(f"Invalid kaggle.json format at {path}. Expected 'username' and 'key' fields.")

        # Credentials supplied by the caller are still usable, so the setup
        # hints below would only be noise.
        if self.kaggle_username and self.kaggle_key:
            return

        print("No valid kaggle.json found in common locations:")
        for path in possible_paths:
            print(f" - {path}")
        print("Please create kaggle.json with your Kaggle API credentials.")

    def download_dataset(self, dataset_name: str, download_path: str = "./data") -> str:
        """
        Download a Kaggle dataset.

        The files are unzipped into ``<download_path>/<owner>_<dataset>``,
        which is created if it does not exist.

        Args:
            dataset_name: Dataset name in format 'username/dataset-name'
            download_path: Where to save the dataset

        Returns:
            Path to downloaded dataset

        Raises:
            ValueError: If no credentials were resolved at construction time.
            Exception: Any error from directory creation or the Kaggle API is
                printed and re-raised unchanged.
        """
        if not self.kaggle_username or not self.kaggle_key:
            raise ValueError("Kaggle credentials not found. Please set up kaggle.json or provide credentials.")

        try:
            # Create a unique directory for this dataset
            dataset_dir = dataset_name.replace('/', '_')
            full_download_path = os.path.join(download_path, dataset_dir)

            # Create the directory if it doesn't exist
            os.makedirs(full_download_path, exist_ok=True)

            kaggle.api.authenticate()
            kaggle.api.dataset_download_files(dataset_name, path=full_download_path, unzip=True)
            print(f"Dataset {dataset_name} downloaded successfully to {full_download_path}")
            return full_download_path
        except Exception as e:
            print(f"Error downloading dataset: {e}")
            raise

    @staticmethod
    def _clean_cell(value) -> str:
        """Return a CSV cell as stripped text, mapping missing values to ''."""
        if pd.isna(value):
            return ""
        return str(value).strip()

    def load_csv_dataset(self, file_path: str, text_columns: List[str], chunk_size: int = 100) -> "List[Document]":
        """
        Load documents from a CSV file.

        When the file has both a 'Questions' and an 'Answers' column it is
        treated as an FAQ: each row becomes one "QUESTION/ANSWER" document and
        ``text_columns`` is ignored. Otherwise the requested ``text_columns``
        that exist in the file are joined with a space, one document per row;
        rows with no non-missing text are skipped.

        Args:
            file_path: Path to the CSV file
            text_columns: Columns to combine into the document text
            chunk_size: Currently unused; kept for interface compatibility

        Returns:
            List of Document objects. Loading is best-effort: on any error the
            error is printed and an empty list is returned.
        """
        try:
            df = pd.read_csv(file_path)
            documents = []

            # For FAQ datasets, try to combine question and answer columns
            if 'Questions' in df.columns and 'Answers' in df.columns:
                print(f"Processing FAQ dataset with {len(df)} Q&A pairs")
                for idx, row in df.iterrows():
                    # Missing cells become "" rather than the literal text "nan".
                    question = self._clean_cell(row['Questions'])
                    answer = self._clean_cell(row['Answers'])
                    if not question and not answer:
                        continue

                    # Create a document with question prominently featured for better retrieval
                    content = f"QUESTION: {question}\n\nANSWER: {answer}"
                    documents.append(Document(
                        page_content=content,
                        metadata={"source": file_path, "type": "faq", "question_id": idx, "question": question}
                    ))
            else:
                # Fallback to original method for other CSV files
                print(f"Processing regular CSV with columns: {text_columns}")
                # Column presence does not change per row, so resolve it once.
                present_columns = [col for col in text_columns if col in df.columns]
                for idx, row in df.iterrows():
                    # Combine specified text columns
                    text_parts = [
                        str(row[col]).strip()
                        for col in present_columns
                        if pd.notna(row[col])
                    ]

                    if text_parts:
                        content = " ".join(text_parts)
                        documents.append(Document(
                            page_content=content,
                            metadata={"source": file_path, "row": idx}
                        ))

            print(f"Created {len(documents)} documents from CSV")
            return documents

        except Exception as e:
            print(f"Error loading CSV dataset: {e}")
            return []

    @staticmethod
    def _read_json_records(file_path: str) -> list:
        """Read a JSON or JSON Lines file and return its records as a list.

        A top-level array is returned as-is, a single top-level object is
        wrapped in a one-element list, and a file that is not one valid JSON
        document is re-parsed line by line as JSON Lines.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            raw = f.read()

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Many Kaggle ".json" files are really JSON Lines (one object per line).
            data = [json.loads(line) for line in raw.splitlines() if line.strip()]

        if isinstance(data, dict):
            return [data]
        if isinstance(data, list):
            return data
        raise ValueError(
            f"Unsupported JSON structure in {file_path}: expected an object or an array of objects"
        )

    def load_json_dataset(self, file_path: str, text_field: str = "text",
                          metadata_fields: Optional[List[str]] = None) -> "List[Document]":
        """
        Load JSON data and convert to documents.

        Accepts a JSON array of objects, a single JSON object, or a JSON Lines
        file. Entries that are not objects are skipped. A missing or null
        ``text_field`` yields an empty ``page_content``; other non-string
        values are converted with ``str()``.

        Args:
            file_path: Path to JSON file
            text_field: Field name containing the main text
            metadata_fields: Fields to include as metadata

        Returns:
            List of Document objects

        Raises:
            OSError: If the file cannot be read.
            ValueError: If the content is not valid JSON / JSON Lines
                (``json.JSONDecodeError``) or its top level is neither an
                object nor an array.
        """
        documents = []

        for item in self._read_json_records(file_path):
            if not isinstance(item, dict):
                continue

            raw_text = item.get(text_field, "")
            text_content = "" if raw_text is None else str(raw_text)

            # Create metadata
            metadata = {"source": file_path}
            if metadata_fields:
                for field in metadata_fields:
                    if field in item:
                        metadata[field] = item[field]

            documents.append(Document(
                page_content=text_content,
                metadata=metadata
            ))

        return documents

    def load_text_dataset(self, file_path: str, chunk_size: int = 1000) -> "List[Document]":
        """
        Load plain text data and convert to documents.

        The text is cut into consecutive, non-overlapping slices of
        ``chunk_size`` characters; the last slice may be shorter. Each
        document's metadata records the source path, the chunk index and the
        start/end character offsets.

        Args:
            file_path: Path to text file
            chunk_size: Number of characters per document

        Returns:
            List of Document objects

        Raises:
            ValueError: If ``chunk_size`` is not a positive integer.
        """
        # Validate before touching the file so a bad size fails fast and clearly.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        total_chars = len(text)
        return [
            Document(
                page_content=text[start:start + chunk_size],
                metadata={
                    "source": file_path,
                    "chunk_id": start // chunk_size,
                    "start_char": start,
                    "end_char": min(start + chunk_size, total_chars)
                }
            )
            for start in range(0, total_chars, chunk_size)
        ]
205
+
206
+ # Example usage functions
207
def load_kaggle_csv_example():
    """Example: download a Kaggle CSV dataset and turn it into documents.

    Builds a ``KaggleDataLoader`` with placeholder credentials (replace them
    with your own), downloads the COVID-19 world vaccination progress dataset
    and loads ``country_vaccinations.csv`` using the country, vaccines and
    source_name columns as text.

    Returns:
        The list of documents produced by ``load_csv_dataset``.
    """
    kaggle_loader = KaggleDataLoader("your_username", "your_api_key")

    # Fetch the dataset, then point at the CSV file inside the download directory.
    download_dir = kaggle_loader.download_dataset("gpreda/covid-world-vaccination-progress")
    vaccinations_csv = os.path.join(download_dir, "country_vaccinations.csv")

    return kaggle_loader.load_csv_dataset(
        vaccinations_csv,
        text_columns=["country", "vaccines", "source_name"],
        chunk_size=100,
    )
224
+
225
+ def load_kaggle_json_example():
226
+ """Example: Load a JSON dataset from Kaggle."""
227
+ loader = KaggleDataLoader("your_username", "your_api_key")
228
+
229
+ # Download dataset (example: news articles)
230
+ dataset_path = loader.download_dataset("rmisra/news-category-dataset")
231
+
232
+ # Load JSON data
233
+ json_file = os.path.join(dataset_path, "News_Category_Dataset_v3.json")
234
+ documents = loader.load_json_dataset(
235
+ json_file,
236
+ text_field="headline",
237
+ metadata_fields=["category", "date"]
238
+ )
239
+
240
+ return documents