import hashlib
from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Project-local modules. From usage below, Config is expected to expose integer
# CHUNK_SIZE and CHUNK_OVERLAP attributes, and Normalizer.normalize_text(str) -> str
# should return cleaned text (or an empty string to drop the chunk).
from config import Config
from utils.normalizer import Normalizer


class DocumentChunker:
    """Splits documents into overlapping chunks and deduplicates them by content hash."""

    def __init__(self):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
        )
        # Hashes of chunks already emitted. Dedup state lives on the instance,
        # so it persists across calls but not across processes.
        self.existing_hashes = set()
        self.normalizer = Normalizer()

    def hash_text(self, text: str) -> str:
        # MD5 serves only as a content fingerprint for dedup, not for security.
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def split_documents(self, docs: List[Document]) -> List[dict]:
        """Split and deduplicate documents.

        Returns a list of dicts with keys: id, text, metadata.
        """
        chunks = self.splitter.split_documents(docs)
        results = []
        for i, chunk in enumerate(chunks):
            normalized_text = self.normalizer.normalize_text(chunk.page_content)
            if not normalized_text:
                continue  # Skip chunks that are empty after normalization
            chunk_hash = self.hash_text(normalized_text)
            if chunk_hash in self.existing_hashes:
                continue  # Skip exact duplicates of chunks already emitted
            self.existing_hashes.add(chunk_hash)
            results.append({
                # Truncate the 128-bit hash into an integer ID below 10**9;
                # collisions are unlikely at this scale, but possible.
                "id": int(chunk_hash, 16) % (10 ** 9),
                "text": normalized_text,
                "metadata": {
                    **chunk.metadata,
                    "chunk_order": i,  # Preserve original chunk order
                },
            })
        return results


if __name__ == "__main__":
    sample_docs = [
        Document(
            page_content="This is a long document that needs to be split into smaller pieces.",
            metadata={"source": "example.txt"},
        )
    ]
    chunker = DocumentChunker()
    chunks = chunker.split_documents(sample_docs)
    for i, chunk in enumerate(chunks):
        print(f"#### Chunk {i}: {chunk['text']}")
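    # A minimal sketch of the dedup behavior: because existing_hashes persists
    # on the instance, feeding the same documents through split_documents a
    # second time should yield no new chunks (assuming Normalizer.normalize_text
    # is deterministic).
    repeat = chunker.split_documents(sample_docs)
    print(f"#### Chunks on second pass (expected 0): {len(repeat)}")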