# rag_codebase/create_faiss.py
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


def create_code_vector_db(json_file_path: str, model_name: str, output_index_path: str, output_metadata_path: str):
    """
    Loads code chunks, filters them, generates embeddings, and saves a FAISS index
    along with the corresponding metadata.

    Args:
        json_file_path (str): Path to the code_chunks.json file.
        model_name (str): The name of the SentenceTransformer model to use.
        output_index_path (str): Path to save the FAISS index file.
        output_metadata_path (str): Path to save the chunk metadata JSON file.
    """
    # 1. Load and Filter Chunks
    print(f"Loading chunks from '{json_file_path}'...")
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            all_chunks = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{json_file_path}' was not found.")
        return
    except json.JSONDecodeError as e:
        print(f"Error: '{json_file_path}' does not contain valid JSON: {e}")
        return

    # Filter for chunks that carry meaningful semantic information for a RAG system
    target_types = {'function', 'class', 'method', 'async_function', 'async_method'}
    filtered_chunks = [chunk for chunk in all_chunks if chunk.get('chunk_type') in target_types]
    if not filtered_chunks:
        print("No chunks of target types found. Exiting.")
        return
    print(f"Filtered chunks: kept {len(filtered_chunks)} out of {len(all_chunks)} total chunks.")

    # 2. Prepare Text for Embedding
    # Combine each chunk's code with its metadata for a richer semantic
    # representation: a descriptive header (type, name, docstring) followed
    # by the source code itself.
    texts_to_embed = []
    for chunk in filtered_chunks:
        docstring = chunk.get('docstring', '') or "No docstring."
        name = chunk.get('name', '')
        chunk_type = chunk.get('chunk_type', '')
        header = f"Type: {chunk_type}, Name: {name}\nDocstring: {docstring}\n---\n"
        texts_to_embed.append(header + chunk.get('content', ''))
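    # For example, a hypothetical function `load_config` would be embedded as:
    #   Type: function, Name: load_config
    #   Docstring: Load the project configuration.
    #   ---
    #   def load_config(path): ...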

    # 3. Generate Embeddings
    print(f"Loading SentenceTransformer model: '{model_name}'...")
    # A model suited for code is beneficial, but a general-purpose one works too;
    # 'microsoft/codebert-base' or 'all-MiniLM-L6-v2' are reasonable starting points.
    # Convert the model to half precision (fp16) for faster inference. This assumes
    # GPU inference; on CPU, fp16 is slow or unsupported, so drop the .half() call.
    model = SentenceTransformer(model_name).half()

    print("Generating embeddings for filtered chunks... (this may take a while)")
    batch_size = 2  # Adjust this based on your available VRAM
    embeddings = model.encode(
        texts_to_embed,
        batch_size=batch_size,
        show_progress_bar=True
    )

    # Convert to float32 for FAISS (the embeddings may be fp16 because of .half()).
    embeddings = np.array(embeddings).astype('float32')
    dimension = embeddings.shape[1]
    print(f"Embeddings generated with dimension: {dimension}")

    # 4. Build and Save FAISS Index
    print("Building FAISS index...")
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    print(f"Saving FAISS index to '{output_index_path}'...")
    faiss.write_index(index, output_index_path)
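    # Note: IndexFlatL2 performs exact, brute-force L2 search and needs no training,
    # which is a reasonable default for small-to-medium collections. For cosine
    # similarity, normalize the embeddings and use faiss.IndexFlatIP instead.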

    # 5. Save Metadata for Mapping
    # Save the original chunk info so FAISS results can be mapped back to the
    # source code: row i in the index corresponds to metadata_to_save[i].
    metadata_to_save = [
        {
            "chunk_id": chunk.get("chunk_id"),
            "file_path": chunk.get("file_path"),
            "start_line": chunk.get("start_line"),
            "end_line": chunk.get("end_line"),
            "name": chunk.get("name"),
            "chunk_type": chunk.get("chunk_type")
        }
        for chunk in filtered_chunks
    ]
print(f"Saving metadata mapping to '{output_metadata_path}'...")
with open(output_metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata_to_save, f, indent=2)
print("\nProcess complete!")
print(f"FAISS index and metadata have been successfully saved.")


if __name__ == "__main__":
    # --- CONFIGURATION ---
    CHUNKS_JSON_PATH = "code_chunks.json"
    # Qwen3-Embedding-0.6B offers a good balance of speed and quality for general
    # retrieval; for more code-specific tasks, you might explore models like
    # 'microsoft/codebert-base'.
    MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
    OUTPUT_INDEX_PATH = "code_faiss.index"
    OUTPUT_METADATA_PATH = "code_metadata.json"

    # --- EXECUTION ---
    create_code_vector_db(
        json_file_path=CHUNKS_JSON_PATH,
        model_name=MODEL_NAME,
        output_index_path=OUTPUT_INDEX_PATH,
        output_metadata_path=OUTPUT_METADATA_PATH
    )