import os

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads the Q4_K_M GGUF build of ALLaM-7B-Instruct-preview from
    bartowski's Hugging Face repository and loads it via llama-cpp.
    """
    # 1) Download the GGUF model from Hugging Face
    model_file = hf_hub_download(
        repo_id="bartowski/ALLaM-AI_ALLaM-7B-Instruct-preview-GGUF",
        filename="ALLaM-AI_ALLaM-7B-Instruct-preview-Q4_K_M.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False,  # deprecated and ignored by recent huggingface_hub releases
    )

    # 2) Load the model with llama-cpp via LangChain's LlamaCpp wrapper.
    # Extra kwargs such as flash_attn and chat_format are passed through to
    # the underlying llama_cpp.Llama constructor.
    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,
        n_ctx=2048,   # or 4096 depending on your needs
        n_batch=512,  # or even 256 depending on your hardware
        chat_format="chatml",
    )
    return llm
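
# Quick sanity check (illustrative, not part of the original app): the object
# returned by load_llm() is a plain LangChain LLM, so it can be called
# directly before any retrieval chain is built, e.g.:
#     llm = load_llm()
#     print(llm("Q: What is retrieval-augmented generation? A:"))
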
def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    # Store chat history in memory so the chain can handle multi-turn conversations
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        ),
        memory=memory,
        verbose=True,
    )
    return qa_chain
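
# --- Usage sketch (illustrative; not part of the original app) ---
# A minimal example of wiring build_conversational_chain to a vector store.
# The FAISS store, the embedding model name, and the sample texts below are
# assumptions for demonstration; any LangChain vector store exposing
# as_retriever() would work the same way.
if __name__ == "__main__":
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    # Hypothetical corpus; in a real app these would be chunks produced by a
    # document loader and text splitter.
    texts = [
        "ALLaM-7B-Instruct-preview is an instruction-tuned language model.",
        "ConversationalRetrievalChain pairs a retriever with chat memory.",
    ]
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_texts(texts, embeddings)

    qa_chain = build_conversational_chain(vectorstore)
    result = qa_chain({"question": "What is ALLaM?"})
    print(result["answer"])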