#!/usr/bin/env python3
"""
Entropy-Based Radial Chunking Search Engine
This module implements an advanced search engine that uses high-entropy tokens
as semantic centers for intelligent text chunking and retrieval. The system
employs semantic binary search and adaptive chunking strategies for efficient
information retrieval.
The algorithm works by:
1. Calculating entropy for each token based on its embedding
2. Identifying high-entropy tokens as semantic centers
3. Creating chunks around these centers
4. Performing semantic search using binary search in the semantic space
Module: rag.py
Author: Aninokuma from Stealth Hut
Date: 2025
Version: 1.0.0
License: MIT
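Example (illustrative usage; assumes the default model directory
"qwen3_int8_harmonic" and a local corpus file such as "sam.txt" are available):
    >>> engine = EntropyRadialSearch()
    >>> engine.process_text_file("sam.txt", entropy_percentile=99)
    >>> results = engine.search("Casimir force measurement 1997", limit=3)
    >>> context = engine.semantic_binary_search("Casimir force measurement 1997")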
"""
import sqlite3
import numpy as np
from safetensors import safe_open
from transformers import AutoTokenizer
import time
import os
from contextlib import contextmanager
from typing import Any, Dict, List, Tuple
class EntropyRadialSearch:
"""Search engine with entropy-based radial chunking
This class implements an advanced search system that uses high-entropy tokens
as semantic centers to create meaningful text chunks. The system provides
efficient retrieval through semantic binary search and adaptive chunking
strategies.
Attributes:
tokenizer: Pre-trained tokenizer for tokenizing text
embeddings: Token embeddings loaded from the model
db_path: Path to the SQLite database for storing chunks and metadata
"""
def __init__(self, model_dir: str = "qwen3_int8_harmonic", db_path: str = "entropy_radial_search.db"):
"""Initialize the EntropyRadialSearch instance.
Args:
model_dir: Directory containing the pre-trained model files
db_path: Path to the SQLite database file
"""
self._log_message("Loading Entropy-Based Radial Search Engine")
# Load model
self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
tensors = {}
with safe_open(f"{model_dir}/model.safetensors", framework="numpy") as f:
for key in f.keys():
tensors[key] = f.get_tensor(key)
self.embeddings = tensors["embeddings"]
self.weights = tensors["weights"] # Load the weights tensor for rarity computation
self._log_message(f"Model loaded: {len(self.embeddings)} tokens, weights tensor: {len(self.weights)} values")
# Initialize database
self.db_path = db_path
self.init_database()
self._log_message(f"Database ready: {db_path}")
def _log_message(self, message: str) -> None:
"""Log a message to standard output.
Args:
message: The message to log
"""
print(f"[LOG] {message}")
@contextmanager
def get_connection(self):
"""SQLite connection context manager
Provides a database connection that is automatically closed after use.
Yields:
sqlite3.Connection: Database connection object
"""
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
try:
yield conn
finally:
conn.close()
def init_database(self) -> None:
"""Initialize database with enhanced schema
Creates the necessary tables and indexes for storing document chunks,
entropy analysis data, and entropy centers. This method ensures all
required database structures exist before the search engine is used.
"""
try:
with self.get_connection() as conn:
# Documents table with enhanced chunk info
conn.execute("""
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
chunk_id INTEGER,
content TEXT,
token_start INTEGER,
token_end INTEGER,
center_token TEXT,
entropy_score REAL,
radius INTEGER,
adaptive_radius BOOLEAN DEFAULT FALSE,
local_density REAL DEFAULT 0.0,
fallback_chunk BOOLEAN DEFAULT FALSE,
embedding BLOB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Entropy analysis table
conn.execute("""
CREATE TABLE IF NOT EXISTS entropy_analysis (
filename TEXT PRIMARY KEY,
total_tokens INTEGER,
total_chunks INTEGER,
entropy_threshold REAL,
high_entropy_tokens INTEGER,
avg_chunk_size REAL,
processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# High-entropy centers table
conn.execute("""
CREATE TABLE IF NOT EXISTS entropy_centers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filename TEXT,
token_position INTEGER,
token_text TEXT,
entropy_score REAL,
chunk_assigned BOOLEAN DEFAULT FALSE
)
""")
# Indexes
conn.execute("CREATE INDEX IF NOT EXISTS idx_chunk_id ON documents(chunk_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_entropy_score ON documents(entropy_score)")
conn.commit()
except sqlite3.Error as e:
self._log_message(f"Database initialization error: {e}")
raise
def calculate_token_entropy(self, tokens: List[int]) -> List[Tuple[int, str, float]]:
"""Calculate entropy for each token based on vector embedding entropy.
This method calculates entropy scores for tokens using both Shannon entropy
of normalized vector components and vector variance to identify
high-entropy tokens that serve as semantic centers.
Args:
tokens: List of token IDs to calculate entropy for
Returns:
List of tuples containing (position, token_text, entropy_score)
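        Note (informal sketch of the score computed below, for token id t with
        embedding v and n = v / ||v||):
            H(t)     = -sum_i |n_i| * log2(|n_i| + 1e-10)
            score(t) = H(t) * (1 + 0.1 * var(v)) * (1 + weights[t] / 127)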
"""
self._log_message("Calculating vector entropy...")
entropy_scores: List[Tuple[int, str, float]] = []
# Calculate entropy for each token's embedding
for i, token_id in enumerate(tokens):
if token_id < len(self.embeddings):
vector = self.embeddings[token_id]
# Method 1: Shannon entropy of normalized vector components
normalized_vector = vector / np.linalg.norm(vector)
# Add small epsilon to avoid log(0)
epsilon = 1e-10
# Use absolute values to avoid negative log values
entropy = -np.sum(np.abs(normalized_vector) * np.log2(np.abs(normalized_vector) + epsilon))
# Method 2: Vector variance (higher variance = more entropy)
variance = np.var(vector)
variance_boost = 1.0 + variance * 0.1
# Incorporate weights for rarity - higher weights indicate rarer tokens
if token_id < len(self.weights):
# Weights are in range [0, 127], higher means rarer/more important
weight_value = self.weights[token_id].item()
# Normalize weight to create a rarity multiplier (e.g., 1.0 to 2.0 range)
rarity_multiplier = 1.0 + (weight_value / 127.0) * 1.0 # Boost up to 2x
else:
rarity_multiplier = 1.0
# Combined entropy score with rarity adjustment
entropy_score = entropy * variance_boost * rarity_multiplier
# Get token text for reference
token_text = self.tokenizer.decode([token_id])
entropy_scores.append((i, token_text, entropy_score))
return entropy_scores
def find_entropy_centers(self, entropy_scores: List[Tuple[int, str, float]], percentile: int = 99) -> Tuple[List[Dict[str, Any]], float]:
"""Find high-entropy tokens to use as chunk centers.
This method identifies tokens with entropy scores above the specified
percentile to serve as semantic centers for chunking.
Args:
entropy_scores: List of (position, token_text, entropy_score) tuples
percentile: Percentile threshold for selecting high-entropy tokens
Returns:
Tuple containing list of center dictionaries and entropy threshold
"""
self._log_message(f"Finding entropy centers (> {percentile}th percentile)...")
# Calculate percentile threshold
scores = [score for _, _, score in entropy_scores]
if not scores:
return [], 0.0
threshold = np.percentile(scores, percentile)
self._log_message(f" Entropy threshold: {threshold:.3f}")
# Find high-entropy tokens
centers = []
for i, token, score in entropy_scores:
if score >= threshold:
centers.append({
'position': i,
'token': token,
'score': score
})
# Sort by entropy score (highest first)
centers.sort(key=lambda x: x['score'], reverse=True)
self._log_message(f" Found {len(centers)} entropy centers")
return centers, threshold
def calculate_adaptive_radius(self, center_score: float, avg_entropy: float, local_entropy_density: float, base_radius: int = 100) -> int:
"""Calculate adaptive radius based on entropy and local density.
This method calculates a dynamic radius for chunking based on the center
token's entropy score and the local entropy density.
Args:
center_score: Entropy score of the center token
avg_entropy: Average entropy across all tokens
local_entropy_density: Density of high-entropy tokens in the local area
base_radius: Base radius to use as reference
Returns:
Integer radius value bounded between 50 and 200 tokens
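        Note (informal sketch of the formula implemented below):
            radius = clip(base_radius * (center_score / avg_entropy) / (1 + local_entropy_density), 50, 200)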
"""
# Higher entropy centers get larger radius
entropy_factor = center_score / avg_entropy if avg_entropy > 0 else 1.0
# Lower density areas get larger radius (sparse regions need more coverage)
density_factor = 1.0 / (1.0 + local_entropy_density)
# Adaptive radius with bounds
adaptive_radius = int(base_radius * entropy_factor * density_factor)
return max(50, min(200, adaptive_radius)) # Bound between 50-200 tokens
def calculate_local_entropy_density(self, centers: List[Dict[str, Any]], position: int, window: int = 50) -> float:
"""Calculate entropy density around a position.
This method calculates the density of high-entropy tokens within a
specified window around a given position.
Args:
centers: List of high-entropy token centers
position: Position to calculate density around
window: Size of the window for density calculation
Returns:
Float representing the entropy density
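        Note: computed as the number of centers within +/- window of the
        position divided by the window size (2 * window + 1).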
"""
nearby_centers = [c for c in centers if abs(c['position'] - position) <= window]
if not nearby_centers:
return 0.0
return len(nearby_centers) / (2 * window + 1)
def is_semantic_boundary(self, token_id: int, tokens: List[int], pos: int) -> bool:
"""Check if token position represents a language-agnostic semantic boundary.
This method determines if a token position represents a semantic boundary
by analyzing embedding similarities with neighboring tokens.
Args:
token_id: ID of the current token
tokens: List of all token IDs
pos: Position of the current token in the token list
Returns:
Boolean indicating whether this position is a semantic boundary
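        Note (informal sketch of the rule implemented below): the score is the
        mean cosine distance to up to four neighbours (positions +/-1 and +/-2),
            boundary = mean_j (1 - cos(v_pos, v_j)),
        scaled by (1 + 0.001 * H), where H is a rarity-weighted vector entropy
        of the current token; the position is treated as a boundary when this
        combined score exceeds 0.4.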
"""
if pos < 0 or pos >= len(tokens):
return True # Document boundaries are always boundaries
# Get current token embedding
if token_id >= len(self.embeddings):
return True
current_vector = self.embeddings[token_id]
# Calculate semantic similarity with neighbors
boundary_scores = []
# Check left semantic discontinuity
if pos > 0:
left_token_id = tokens[pos - 1]
if left_token_id < len(self.embeddings):
left_vector = self.embeddings[left_token_id]
# Cosine similarity (lower = more boundary-like)
similarity = np.dot(current_vector, left_vector) / (
np.linalg.norm(current_vector) * np.linalg.norm(left_vector)
)
boundary_scores.append(1.0 - similarity) # Convert to boundary score
# Check right semantic discontinuity
if pos < len(tokens) - 1:
right_token_id = tokens[pos + 1]
if right_token_id < len(self.embeddings):
right_vector = self.embeddings[right_token_id]
similarity = np.dot(current_vector, right_vector) / (
np.linalg.norm(current_vector) * np.linalg.norm(right_vector)
)
boundary_scores.append(1.0 - similarity) # Convert to boundary score
# Check larger context (2 tokens away) for broader boundaries
if pos > 1:
left2_token_id = tokens[pos - 2]
if left2_token_id < len(self.embeddings):
left2_vector = self.embeddings[left2_token_id]
similarity = np.dot(current_vector, left2_vector) / (
np.linalg.norm(current_vector) * np.linalg.norm(left2_vector)
)
boundary_scores.append(1.0 - similarity)
if pos < len(tokens) - 2:
right2_token_id = tokens[pos + 2]
if right2_token_id < len(self.embeddings):
right2_vector = self.embeddings[right2_token_id]
similarity = np.dot(current_vector, right2_vector) / (
np.linalg.norm(current_vector) * np.linalg.norm(right2_vector)
)
boundary_scores.append(1.0 - similarity)
# Average boundary score
if boundary_scores:
avg_boundary_score = sum(boundary_scores) / len(boundary_scores)
# Check if vector entropy is also high (indicating transition)
if token_id < len(self.embeddings):
vector = self.embeddings[token_id]
normalized_vector = vector / np.linalg.norm(vector)
epsilon = 1e-10
# Use absolute values to avoid negative log values
entropy = -np.sum(np.abs(normalized_vector) * np.log2(np.abs(normalized_vector) + epsilon))
# Incorporate weights for rarity - higher weights indicate rarer tokens
if token_id < len(self.weights):
weight_value = self.weights[token_id].item()
rarity_multiplier = 1.0 + (weight_value / 127.0) * 0.5 # Moderate boost for boundaries
else:
rarity_multiplier = 1.0
entropy = entropy * rarity_multiplier
# Combine semantic discontinuity with entropy
combined_score = avg_boundary_score * (1.0 + entropy * 0.001)
# Threshold determined empirically - works across languages
return combined_score > 0.4
return False
def expand_to_boundaries(self, tokens: List[int], center_pos: int, max_expansion: int = 500) -> Tuple[int, int]:
"""Naturally expand from center until hitting natural boundaries.
This method expands from a center position in both directions until
semantic boundaries are encountered.
Args:
tokens: List of token IDs
center_pos: Starting position for expansion
max_expansion: Maximum number of tokens to expand in each direction
Returns:
Tuple containing start and end positions for the chunk
"""
start_pos = center_pos
end_pos = center_pos
# Expand left until boundary
left_expansion = 0
while start_pos > 0 and left_expansion < max_expansion:
prev_pos = start_pos - 1
if self.is_semantic_boundary(tokens[prev_pos], tokens, prev_pos):
break
start_pos = prev_pos
left_expansion += 1
# Expand right until boundary
right_expansion = 0
while end_pos < len(tokens) - 1 and right_expansion < max_expansion:
next_pos = end_pos + 1
if self.is_semantic_boundary(tokens[next_pos], tokens, next_pos):
break
end_pos = next_pos
right_expansion += 1
        return start_pos, end_pos + 1  # +1 so the exclusive end index includes end_pos
def create_radial_chunks(self, tokens: List[int], centers: List[Dict[str, Any]], max_expansion: int = 500) -> List[Dict[str, Any]]:
"""Slice corpus at midpoints between 99th percentile high-entropy tokens.
This method creates chunks by slicing the text at midpoints between
high-entropy token centers.
Args:
tokens: List of all token IDs in the document
centers: List of high-entropy token centers
max_expansion: Maximum expansion for chunk boundaries (not used in this method)
Returns:
List of chunk dictionaries with position and content information
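        Example (illustrative): for sorted center positions p0 < p1 < p2, the
        cut points are m0 = (p0 + p1) // 2 and m1 = (p1 + p2) // 2, giving
        chunks tokens[0:m0+1], tokens[m0+1:m1+1] and tokens[m1+1:len(tokens)].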
"""
self._log_message(f"SLICING corpus at midpoints between {len(centers)} high-entropy centers...")
if not centers:
return []
# Sort centers by position
sorted_centers = sorted(centers, key=lambda x: x['position'])
chunks = []
# First chunk: start to first midpoint
if len(sorted_centers) > 1:
first_center = sorted_centers[0]
second_center = sorted_centers[1]
midpoint = (first_center['position'] + second_center['position']) // 2
start_pos = 0
end_pos = midpoint + 1
chunk_tokens = tokens[start_pos:end_pos]
chunks.append({
'center_pos': first_center['position'],
'center_token': first_center['token'],
'center_score': first_center['score'],
'start_pos': start_pos,
'end_pos': end_pos,
'tokens': chunk_tokens,
'chunk_id': len(chunks)
})
# Middle chunks: between adjacent midpoints
for i in range(1, len(sorted_centers) - 1):
current_center = sorted_centers[i]
prev_center = sorted_centers[i-1]
next_center = sorted_centers[i+1]
left_midpoint = (prev_center['position'] + current_center['position']) // 2
right_midpoint = (current_center['position'] + next_center['position']) // 2
start_pos = left_midpoint + 1
end_pos = right_midpoint + 1
chunk_tokens = tokens[start_pos:end_pos]
chunks.append({
'center_pos': current_center['position'],
'center_token': current_center['token'],
'center_score': current_center['score'],
'start_pos': start_pos,
'end_pos': end_pos,
'tokens': chunk_tokens,
'chunk_id': len(chunks)
})
# Last chunk: last midpoint to end
if len(sorted_centers) > 1:
last_center = sorted_centers[-1]
prev_center = sorted_centers[-2]
midpoint = (prev_center['position'] + last_center['position']) // 2
start_pos = midpoint + 1
end_pos = len(tokens)
chunk_tokens = tokens[start_pos:end_pos]
chunks.append({
'center_pos': last_center['position'],
'center_token': last_center['token'],
'center_score': last_center['score'],
'start_pos': start_pos,
'end_pos': end_pos,
'tokens': chunk_tokens,
'chunk_id': len(chunks)
})
# If only one center, whole document is one chunk
elif len(sorted_centers) == 1:
center = sorted_centers[0]
chunks.append({
'center_pos': center['position'],
'center_token': center['token'],
'center_score': center['score'],
'start_pos': 0,
'end_pos': len(tokens),
'tokens': tokens,
'chunk_id': len(chunks)
})
self._log_message(f" Sliced corpus into {len(chunks)} chunks at high-entropy midpoints")
self._log_message(f" Perfect coverage: 100% (corpus fully partitioned)")
self._log_message(f" Avg chunk size: {len(tokens)/len(chunks):.1f} tokens")
return chunks
def semantic_binary_search(self, query: str, max_depth: int = 5) -> List[Dict[str, Any]]:
"""Navigate corpus using semantic binary search on 99th percentile entropy tokens.
This method performs a binary search in semantic space by analyzing
high-entropy tokens in chunks and navigating toward the most relevant
content based on query similarity.
Args:
query: Query string to search for
max_depth: Maximum depth for semantic search navigation
Returns:
List of context chunks around the stopping point
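        Example (illustrative; assumes a corpus has already been indexed with
        process_text_file):
            >>> context = engine.semantic_binary_search("Casimir force measurement 1997")
            >>> context[0]['content']  # most similar chunk in the returned context window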
"""
self._log_message(f"SEMANTIC BINARY SEARCH: '{query}'")
self._log_message("=" * 60)
# Encode query
query_embedding = self.encode_text(query)
# Get all chunks with their centers
with self.get_connection() as conn:
cursor = conn.execute("""
SELECT chunk_id, center_token, entropy_score, token_start, token_end, content, embedding
FROM documents
ORDER BY chunk_id
""")
chunks = []
for row in cursor:
doc_embedding = np.frombuffer(row['embedding'], dtype=np.float32)
similarity = self.cosine_similarity(query_embedding, doc_embedding)
chunks.append({
'chunk_id': row['chunk_id'],
'center_token': row['center_token'],
'center_score': row['entropy_score'],
'token_start': row['token_start'],
'token_end': row['token_end'],
'content': row['content'],
'similarity': similarity
})
# Sort by similarity to find best starting point
chunks.sort(key=lambda x: x['similarity'], reverse=True)
if not chunks:
self._log_message("No chunks found for semantic search")
return []
# Start with best chunk
current_chunk = chunks[0]
path = [current_chunk]
self._log_message(f"Starting at Chunk {current_chunk['chunk_id']} (similarity: {current_chunk['similarity']:.3f})")
self._log_message(f" Center: '{current_chunk['center_token']}' (entropy: {current_chunk['center_score']:.2f})")
for depth in range(max_depth):
self._log_message(f"\nDepth {depth + 1}: Analyzing Chunk {current_chunk['chunk_id']}")
# Get tokens for this chunk (excluding center)
chunk_tokens = []
with self.get_connection() as conn:
cursor = conn.execute("""
SELECT content, token_start, token_end FROM documents WHERE chunk_id = ?
""", (current_chunk['chunk_id'],))
row = cursor.fetchone()
if row:
# Reconstruct chunk tokens from stored content
chunk_text = row['content']
chunk_tokens = self.tokenizer.encode(chunk_text, add_special_tokens=False)
# Update current_chunk with token positions
current_chunk['token_start'] = row['token_start']
current_chunk['token_end'] = row['token_end']
if len(chunk_tokens) < 3:
self._log_message(f" Chunk too small for binary navigation")
break
            # Find center position in chunk tokens
            center_token_ids = self.tokenizer.encode(current_chunk['center_token'], add_special_tokens=False)
            center_token_id = center_token_ids[0] if center_token_ids else None
if center_token_id is None:
self._log_message(f" Cannot find center token in chunk")
break
try:
center_pos = chunk_tokens.index(center_token_id)
except ValueError:
center_pos = len(chunk_tokens) // 2 # Fallback to middle
self._log_message(f" Center token not found, using middle position {center_pos}")
# Split into left and right sections
left_tokens = chunk_tokens[:center_pos]
right_tokens = chunk_tokens[center_pos + 1:]
self._log_message(f" Left section: {len(left_tokens)} tokens")
self._log_message(f" Right section: {len(right_tokens)} tokens")
# Find 99th percentile high-entropy tokens in each section
left_entropy_scores = []
for i, token_id in enumerate(left_tokens):
if token_id < len(self.embeddings):
vector = self.embeddings[token_id]
normalized_vector = vector / np.linalg.norm(vector)
epsilon = 1e-10
                    entropy = -np.sum(np.abs(normalized_vector) * np.log2(np.abs(normalized_vector) + epsilon))
variance = np.var(vector)
# Incorporate weights for rarity - higher weights indicate rarer tokens
if token_id < len(self.weights):
weight_value = self.weights[token_id].item()
rarity_multiplier = 1.0 + (weight_value / 127.0) * 1.0 # Boost up to 2x
else:
rarity_multiplier = 1.0
entropy_score = entropy * (1.0 + variance * 0.1) * rarity_multiplier
left_entropy_scores.append((i, token_id, entropy_score))
right_entropy_scores = []
for i, token_id in enumerate(right_tokens):
if token_id < len(self.embeddings):
vector = self.embeddings[token_id]
normalized_vector = vector / np.linalg.norm(vector)
epsilon = 1e-10
                    entropy = -np.sum(np.abs(normalized_vector) * np.log2(np.abs(normalized_vector) + epsilon))
variance = np.var(vector)
# Incorporate weights for rarity - higher weights indicate rarer tokens
if token_id < len(self.weights):
weight_value = self.weights[token_id].item()
rarity_multiplier = 1.0 + (weight_value / 127.0) * 1.0 # Boost up to 2x
else:
rarity_multiplier = 1.0
entropy_score = entropy * (1.0 + variance * 0.1) * rarity_multiplier
right_entropy_scores.append((i, token_id, entropy_score))
# Get 99th percentile threshold for each section
if left_entropy_scores:
left_scores = [score for _, _, score in left_entropy_scores]
left_threshold = np.percentile(left_scores, 99) if len(left_scores) > 0 else 0
left_high_entropy = [(pos, token_id, score) for pos, token_id, score in left_entropy_scores if score >= left_threshold]
else:
left_high_entropy = []
left_threshold = 0
if right_entropy_scores:
right_scores = [score for _, _, score in right_entropy_scores]
right_threshold = np.percentile(right_scores, 99) if len(right_scores) > 0 else 0
right_high_entropy = [(pos, token_id, score) for pos, token_id, score in right_entropy_scores if score >= right_threshold]
else:
right_high_entropy = []
right_threshold = 0
self._log_message(f" Left 99th percentile threshold: {left_threshold:.2f} ({len(left_high_entropy)} tokens)")
self._log_message(f" Right 99th percentile threshold: {right_threshold:.2f} ({len(right_high_entropy)} tokens)")
if not left_high_entropy and not right_high_entropy:
self._log_message(f" No high-entropy tokens found - search complete")
break
# Calculate query similarity to high-entropy tokens
left_similarity_sum = 0
right_similarity_sum = 0
for pos, token_id, score in left_high_entropy:
if token_id < len(self.embeddings):
token_embedding = self.embeddings[token_id]
similarity = self.cosine_similarity(query_embedding, token_embedding)
left_similarity_sum += similarity * score
for pos, token_id, score in right_high_entropy:
if token_id < len(self.embeddings):
token_embedding = self.embeddings[token_id]
similarity = self.cosine_similarity(query_embedding, token_embedding)
right_similarity_sum += similarity * score
self._log_message(f" Left query similarity: {left_similarity_sum:.3f}")
self._log_message(f" Right query similarity: {right_similarity_sum:.3f}")
# Decide direction
if left_similarity_sum > right_similarity_sum:
direction = "LEFT"
self._log_message(f" Query more similar to LEFT section")
elif right_similarity_sum > left_similarity_sum:
direction = "RIGHT"
self._log_message(f" Query more similar to RIGHT section")
else:
direction = "EQUAL"
self._log_message(f" Equal similarity - search complete")
break
# Find adjacent chunk in that direction
with self.get_connection() as conn:
if direction == "LEFT":
cursor = conn.execute("""
SELECT chunk_id, center_token, entropy_score, content, embedding
FROM documents
WHERE token_end < ?
ORDER BY token_end DESC
LIMIT 1
""", (current_chunk['token_start'],))
else: # RIGHT
cursor = conn.execute("""
SELECT chunk_id, center_token, entropy_score, content, embedding
FROM documents
WHERE token_start > ?
ORDER BY token_start ASC
LIMIT 1
""", (current_chunk['token_end'],))
next_row = cursor.fetchone()
if next_row:
next_embedding = np.frombuffer(next_row['embedding'], dtype=np.float32)
next_similarity = self.cosine_similarity(query_embedding, next_embedding)
current_chunk = {
'chunk_id': next_row['chunk_id'],
'center_token': next_row['center_token'],
'center_score': next_row['entropy_score'],
'content': next_row['content'],
'similarity': next_similarity
}
path.append(current_chunk)
self._log_message(f" Moved {direction} to Chunk {current_chunk['chunk_id']} (similarity: {current_chunk['similarity']:.3f})")
self._log_message(f" Center: '{current_chunk['center_token']}' (entropy: {current_chunk['center_score']:.2f})")
self._log_message(f" Content: \"{current_chunk['content'][:100]}...\"")
else:
self._log_message(f" No more chunks in {direction} direction")
break
self._log_message(f"\nSEMANTIC SEARCH PATH:")
self._log_message("-" * 40)
for i, chunk in enumerate(path):
self._log_message(f"Step {i}: Chunk {chunk['chunk_id']} (similarity: {chunk['similarity']:.3f})")
self._log_message(f" Center: '{chunk['center_token']}' - \"{chunk['content'][:80]}...\"")
# Return context window around stopping point (~512 target tokens)
return self.get_context_window(current_chunk, query_embedding, target_tokens=512)
    def get_context_window(self, center_chunk: Dict[str, Any], query_embedding: np.ndarray, target_tokens: int = 512) -> List[Dict[str, Any]]:
        """Collect surrounding chunks around the stopping point until reaching the target token count.
        Args:
            center_chunk: Chunk dictionary where the semantic binary search stopped
            query_embedding: Encoded query vector used to score chunks
            target_tokens: Approximate number of tokens to gather around the center
        Returns:
            List of context chunk dictionaries sorted by similarity to the query
        """
        self._log_message(f"\nGETTING CONTEXT WINDOW: ~{target_tokens} tokens around Chunk {center_chunk['chunk_id']}")
self._log_message("=" * 60)
with self.get_connection() as conn:
# Get all chunks ordered by chunk_id for token-based expansion
cursor = conn.execute("""
SELECT chunk_id, center_token, entropy_score, token_start, token_end, content, embedding
FROM documents
ORDER BY chunk_id
""")
all_chunks = []
for row in cursor:
doc_embedding = np.frombuffer(row['embedding'], dtype=np.float32)
similarity = self.cosine_similarity(query_embedding, doc_embedding)
tokens_in_chunk = row['token_end'] - row['token_start']
all_chunks.append({
'chunk_id': row['chunk_id'],
'center_token': row['center_token'],
'center_score': row['entropy_score'],
'token_start': row['token_start'],
'token_end': row['token_end'],
'content': row['content'],
'similarity': similarity,
'tokens_count': tokens_in_chunk,
'direction': 'CENTER' if row['chunk_id'] == center_chunk['chunk_id'] else None
})
# Find the center chunk position in the ordered list
center_idx = None
for i, chunk in enumerate(all_chunks):
if chunk['chunk_id'] == center_chunk['chunk_id']:
center_idx = i
chunk['direction'] = 'CENTER' # Mark as center
break
if center_idx is None:
self._log_message("Center chunk not found in all chunks list")
return []
# Expand outwards from the center chunk until we reach target token count
context_chunks = [all_chunks[center_idx]]
total_tokens = all_chunks[center_idx]['tokens_count']
left_idx = center_idx - 1
right_idx = center_idx + 1
while total_tokens < target_tokens:
            # Choose a side to expand: use whichever side still has chunks; if both do, take the smaller chunk to keep the window balanced
left_available = left_idx >= 0
right_available = right_idx < len(all_chunks)
if not left_available and not right_available:
break # No more chunks to add
# If only one side is available, take from that side
if left_available and not right_available:
next_chunk = all_chunks[left_idx]
left_idx -= 1
elif right_available and not left_available:
next_chunk = all_chunks[right_idx]
right_idx += 1
else:
# Both sides available, take the one with fewer tokens to balance context
left_chunk = all_chunks[left_idx]
right_chunk = all_chunks[right_idx]
if left_chunk['tokens_count'] <= right_chunk['tokens_count']:
next_chunk = left_chunk
left_idx -= 1
else:
next_chunk = right_chunk
right_idx += 1
# Add direction indicator
direction = 'BEFORE' if next_chunk['chunk_id'] < center_chunk['chunk_id'] else 'AFTER'
next_chunk['direction'] = direction
context_chunks.append(next_chunk)
total_tokens += next_chunk['tokens_count']
# Sort by similarity for display and return
context_chunks_sorted = sorted(context_chunks, key=lambda x: x['similarity'], reverse=True)
self._log_message(f"CONTEXT WINDOW RESULTS:")
self._log_message(f" Total chunks: {len(context_chunks)}")
self._log_message(f" Total tokens: {total_tokens}")
self._log_message(f" Range: Chunk {min(c['chunk_id'] for c in context_chunks)} to Chunk {max(c['chunk_id'] for c in context_chunks)}")
self._log_message(f" Center: Chunk {center_chunk['chunk_id']} ('{center_chunk['center_token']}')")
self._log_message(f"\nTOP 10 MOST SIMILAR CHUNKS IN CONTEXT:")
self._log_message("-" * 50)
for i, chunk in enumerate(context_chunks_sorted[:10], 1):
self._log_message(f"{i}. Chunk {chunk['chunk_id']} (similarity: {chunk['similarity']:.3f})")
self._log_message(f" Direction: {chunk['direction']}, Tokens: {chunk['tokens_count']}")
self._log_message(f" Center: '{chunk['center_token']}' (entropy: {chunk['center_score']:.2f})")
self._log_message(f" Content: \"{chunk['content'][:100]}...\"")
self._log_message("")
            # Look for specific keywords in the context (hard-coded diagnostics for the sample physics corpus)
keywords = ['radar', 'cavity', 'microwave', 'waveguide', 'standing', 'waves', 'zero-point', 'energy']
keyword_matches = []
for chunk in context_chunks:
content_lower = chunk['content'].lower()
for keyword in keywords:
if keyword in content_lower:
if chunk not in keyword_matches:
keyword_matches.append(chunk)
break
if keyword_matches:
self._log_message(f"\nKEYWORD MATCHES FOUND:")
self._log_message("-" * 40)
for i, chunk in enumerate(keyword_matches[:5], 1):
self._log_message(f"{i}. Chunk {chunk['chunk_id']} (similarity: {chunk['similarity']:.3f})")
# Show the keyword match
content_lower = chunk['content'].lower()
matched_keywords = [kw for kw in keywords if kw in content_lower]
self._log_message(f" Keywords: {', '.join(matched_keywords)}")
self._log_message(f" Content: \"{chunk['content'][:150]}...\"")
self._log_message("")
self._log_message(f"CONTEXT WINDOW SEARCH COMPLETE")
self._log_message(f" Analyzed {len(context_chunks)} chunks (~{total_tokens} tokens) around the semantic search stopping point")
return context_chunks_sorted
def create_fallback_chunks(self, tokens: List[int], unassigned_positions: List[int], start_chunk_id: int, fallback_size: int = 64) -> List[Dict[str, Any]]:
"""Create chunks for unassigned tokens to ensure complete coverage.
This method creates fallback chunks for any tokens that weren't assigned
to high-entropy center-based chunks, ensuring complete document coverage.
Args:
tokens: List of all token IDs in the document
unassigned_positions: List of token positions that weren't assigned to chunks
start_chunk_id: Initial chunk ID to use for the fallback chunks
fallback_size: Size of each fallback chunk
Returns:
List of fallback chunk dictionaries
"""
fallback_chunks = []
# Group consecutive unassigned tokens
groups = []
current_group = []
for pos in unassigned_positions:
if not current_group or pos == current_group[-1] + 1:
current_group.append(pos)
else:
groups.append(current_group)
current_group = [pos]
if current_group:
groups.append(current_group)
# Create chunks for each group
for group in groups:
for i in range(0, len(group), fallback_size):
chunk_start = group[i]
chunk_end = group[min(i + fallback_size - 1, len(group) - 1)]
chunk_tokens = tokens[chunk_start:chunk_end + 1]
fallback_chunks.append({
'center_pos': (chunk_start + chunk_end) // 2,
'center_token': 'fallback',
'center_score': 0.0,
'start_pos': chunk_start,
'end_pos': chunk_end + 1,
'tokens': chunk_tokens,
'chunk_id': start_chunk_id + len(fallback_chunks),
'radius': (chunk_end - chunk_start + 1) // 2,
'adaptive_radius': False,
'fallback': True,
'local_density': 0.0
})
return fallback_chunks
def process_text_file(self, file_path: str, entropy_percentile: int = 85, chunk_radius: int = 100, enable_adaptive: bool = True) -> None:
"""Process text file with improved entropy-based radial chunking.
This method processes a text file by tokenizing it, calculating entropy
scores for each token, finding high-entropy centers, and creating
radial chunks around these centers.
Args:
file_path: Path to the text file to process
entropy_percentile: Percentile for selecting high-entropy tokens
chunk_radius: Base radius for chunks
enable_adaptive: Whether to enable adaptive chunking
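        Example (illustrative; mirrors the call made in main()):
            >>> engine.process_text_file("sam.txt", entropy_percentile=99,
            ...                          chunk_radius=100, enable_adaptive=True)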
"""
self._log_message(f"Processing: {file_path}")
self._log_message(f" Entropy percentile: {entropy_percentile}%, Base radius: {chunk_radius}, Adaptive: {enable_adaptive}")
# Read file
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
text = f.read()
# Tokenize
self._log_message("Tokenizing text...")
tokens = self.tokenizer.encode(text, add_special_tokens=False)
self._log_message(f" Total tokens: {len(tokens):,}")
# Calculate entropy scores
entropy_scores = self.calculate_token_entropy(tokens)
# Find entropy centers
centers, threshold = self.find_entropy_centers(entropy_scores, entropy_percentile)
# Create radial chunks
chunks = self.create_radial_chunks(tokens, centers, max_expansion=chunk_radius)
# Clear existing data
with self.get_connection() as conn:
conn.execute("DELETE FROM documents")
conn.execute("DELETE FROM entropy_centers")
conn.commit()
# Process and store chunks
start_time = time.time()
with self.get_connection() as conn:
for chunk in chunks:
# Decode chunk back to text
chunk_text = self.tokenizer.decode(chunk['tokens'])
# Generate embedding
embedding = self.encode_text(chunk_text)
# Store in database with enhanced metadata
conn.execute("""
INSERT INTO documents
(chunk_id, content, token_start, token_end, center_token, entropy_score,
radius, adaptive_radius, local_density, fallback_chunk, embedding)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
chunk['chunk_id'],
chunk_text,
chunk['start_pos'],
chunk['end_pos'],
chunk['center_token'],
chunk['center_score'],
chunk.get('radius', 0),
chunk.get('adaptive_radius', False),
chunk.get('local_density', 0.0),
chunk.get('fallback', False),
embedding.tobytes()
))
# Store entropy centers
for center in centers:
conn.execute("""
INSERT INTO entropy_centers
(filename, token_position, token_text, entropy_score)
VALUES (?, ?, ?, ?)
""", (os.path.basename(file_path), center['position'], center['token'], center['score']))
# Store metadata
avg_chunk_size = sum(len(c['tokens']) for c in chunks) / len(chunks) if chunks else 0
conn.execute("""
INSERT OR REPLACE INTO entropy_analysis
(filename, total_tokens, total_chunks, entropy_threshold, high_entropy_tokens, avg_chunk_size)
VALUES (?, ?, ?, ?, ?, ?)
""", (
os.path.basename(file_path),
len(tokens),
len(chunks),
threshold,
len(centers),
avg_chunk_size
))
conn.commit()
elapsed = time.time() - start_time
self._log_message(f"Processing complete in {elapsed:.2f}s")
if chunks:
self._log_message(f" Avg time per chunk: {elapsed/len(chunks)*1000:.2f}ms")
else:
self._log_message(" Avg time per chunk: N/A")
def encode_text(self, text: str) -> np.ndarray:
"""Encode text using mean pooling.
This method converts text to embeddings by tokenizing it and then
taking the mean of the token embeddings.
Args:
text: Input text to encode
Returns:
Normalized embedding vector
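        Note: computed as normalize(mean_t embeddings[t]) over the known tokens
        of the text; a zero vector is returned when no known tokens are found.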
"""
tokens = self.tokenizer.encode(text, add_special_tokens=False)
if not tokens:
            return np.zeros(self.embeddings.shape[1], dtype=np.float32)
token_embeddings = []
for token_id in tokens:
if token_id < len(self.embeddings):
token_embeddings.append(self.embeddings[token_id])
if not token_embeddings:
            return np.zeros(self.embeddings.shape[1], dtype=np.float32)
token_embeddings = np.array(token_embeddings, dtype=np.float32)
final_embedding = np.mean(token_embeddings, axis=0)
final_embedding = final_embedding / np.linalg.norm(final_embedding)
return final_embedding
def cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
"""Calculate cosine similarity between two vectors.
Args:
a: First vector
b: Second vector
Returns:
Cosine similarity value between -1 and 1
"""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)
def search(self, query: str, limit: int = 10, min_similarity: float = 0.1) -> List[Dict[str, Any]]:
"""Search with entropy-aware results.
This method performs a similarity search for the given query and
returns the most relevant text chunks based on embedding similarity.
Args:
query: Query string to search for
limit: Maximum number of results to return
min_similarity: Minimum similarity threshold for results
Returns:
List of result dictionaries containing chunk information
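        Example (illustrative; assumes a processed corpus):
            >>> results = engine.search("quantum mechanics entanglement", limit=3)
            >>> results[0]['similarity'], results[0]['center_token']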
"""
self._log_message(f"Searching for: '{query}'")
# Encode query
query_embedding = self.encode_text(query)
# Search in database
start_time = time.time()
with self.get_connection() as conn:
cursor = conn.execute("""
SELECT chunk_id, content, token_start, token_end, center_token, entropy_score,
radius, adaptive_radius, local_density, fallback_chunk, embedding
FROM documents
ORDER BY chunk_id
""")
results = []
for row in cursor:
# Decode embedding
doc_embedding = np.frombuffer(row['embedding'], dtype=np.float32)
# Calculate similarity
similarity = self.cosine_similarity(query_embedding, doc_embedding)
if similarity >= min_similarity:
results.append({
'chunk_id': row['chunk_id'],
'content': row['content'][:200] + '...' if len(row['content']) > 200 else row['content'],
'full_content': row['content'],
'token_start': row['token_start'],
'token_end': row['token_end'],
'center_token': row['center_token'],
'entropy_score': row['entropy_score'],
'radius': row['radius'],
'adaptive_radius': bool(row['adaptive_radius']),
'local_density': row['local_density'],
'fallback': bool(row['fallback_chunk']),
'similarity': similarity
})
# Sort by similarity
results.sort(key=lambda x: x['similarity'], reverse=True)
results = results[:limit]
search_time = time.time() - start_time
# Display results with enhanced chunk info
self._log_message(f"Found {len(results)} results in {search_time:.3f}s:")
self._log_message("-" * 80)
for i, result in enumerate(results):
chunk_type = "Fallback" if result['center_token'] == 'fallback' else "Radial"
radius_info = f"radius: {result.get('radius', 'N/A')}" if 'radius' in result else ""
density_info = f"density: {result.get('local_density', 0.0):.3f}" if 'local_density' in result else ""
extra_info = []
if radius_info:
extra_info.append(radius_info)
if density_info:
extra_info.append(density_info)
if result.get('adaptive_radius', False):
extra_info.append("adaptive")
info_str = f" ({', '.join(extra_info)})" if extra_info else ""
self._log_message(f"{i+1}. [{result['similarity']:.3f}] {chunk_type} Chunk {result['chunk_id']} (center: '{result['center_token']}', entropy: {result['entropy_score']:.2f}){info_str}")
self._log_message(f" {result['content']}")
self._log_message("")
return results
def get_entropy_stats(self) -> None:
"""Get entropy analysis statistics.
This method retrieves and displays statistics about the entropy-based
chunking process, including document counts, entropy centers, and
processing information.
"""
with self.get_connection() as conn:
# Document stats
doc_stats = conn.execute("SELECT COUNT(*) as count FROM documents").fetchone()
# Entropy analysis
entropy_stats = conn.execute("""
SELECT filename, total_tokens, total_chunks, entropy_threshold,
high_entropy_tokens, avg_chunk_size, processed_at
FROM entropy_analysis
ORDER BY processed_at DESC
LIMIT 1
""").fetchone()
# Top entropy centers
top_centers = conn.execute("""
SELECT token_text, entropy_score, chunk_assigned
FROM entropy_centers
ORDER BY entropy_score DESC
LIMIT 10
""").fetchall()
self._log_message("Entropy-Based Search Statistics:")
self._log_message(f" Total chunks: {doc_stats['count']:,}")
if entropy_stats:
self._log_message(f" File: {entropy_stats['filename']}")
self._log_message(f" Total tokens: {entropy_stats['total_tokens']:,}")
self._log_message(f" Entropy centers: {entropy_stats['high_entropy_tokens']:,}")
self._log_message(f" Entropy threshold: {entropy_stats['entropy_threshold']:.3f}")
self._log_message(f" Avg chunk size: {entropy_stats['avg_chunk_size']:.1f} tokens")
self._log_message(f" Processed: {entropy_stats['processed_at']}")
self._log_message(f"\nTop 10 High-Entropy Centers:")
for i, center in enumerate(top_centers, 1):
assigned = "✓" if center['chunk_assigned'] else "✗"
self._log_message(f" {i:2d}. [{center['entropy_score']:.2f}] '{center['token_text']}' {assigned}")
def main():
"""Main function to demonstrate the EntropyRadialSearch functionality.
This function initializes the search engine, processes a sample file if
available, and performs test searches to demonstrate the capabilities
of the entropy-based radial chunking system.
"""
print("Entropy-Based Radial Search Engine")
print("=" * 50)
# Initialize search engine
search_engine = EntropyRadialSearch()
# Check if sam.txt exists
sam_file = "sam.txt"
if os.path.exists(sam_file):
print(f"Found {sam_file} ({os.path.getsize(sam_file)/1024/1024:.1f} MB)")
# Process with improved entropy-based chunking (adaptive enabled)
search_engine.process_text_file(sam_file, entropy_percentile=99, chunk_radius=100, enable_adaptive=True)
# Show stats
search_engine.get_entropy_stats()
# Test with some queries
test_queries = [
"Steven Lamoreaux Casimir experiment",
"Jahn Teller effect dynamic",
"quantum mechanics entanglement",
"Casimir force measurement 1997"
]
print(f"\nTesting entropy-based search:")
print("-" * 50)
for query in test_queries:
search_engine.search(query, limit=3)
print()
print("Entropy-Based Search Engine Test Complete!")
if __name__ == "__main__":
main()