Update app.py
app.py
CHANGED
@@ -2,196 +2,157 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
-import torch
import uvicorn
-from
import os
-from
-import sys  # Import sys for sys.exit()

-    """
-    Handles startup and shutdown events for the FastAPI application.
-    Loads the model on startup and can optionally clean up on shutdown.
-    """
-    global generator
-    try:
-        # --- Optional: Login to Hugging Face Hub for gated models ---
-        # If you are using a gated model (e.g., meta-llama/Llama-3-8B-Instruct),
-        # uncomment the following lines and ensure HF_TOKEN is set as a Space Secret.
-        # hf_token = os.getenv("HF_TOKEN")
-        # if hf_token:
-        #     login(token=hf_token)
-        #     print("Logged into Hugging Face Hub.")
-        # else:
-        #     print("HF_TOKEN not found. Make sure it's set as a Space Secret if using a gated model.")

-            # For larger models, use device_map="auto" and torch_dtype
-            # device_map = "auto"
-            # torch_dtype = torch.bfloat16  # or torch.float16 for GPUs that support it
-        else:
-            print("CUDA not available, using CPU. Inference will be very slow for this model size.")
-            device = -1  # Use CPU
-            # device_map = None
-            # torch_dtype = torch.float32  # Default for CPU

-            model=MODEL_NAME,
-            device=device,
-            # Pass your HF token to the model loading for gated models
-            # token=os.getenv("HF_TOKEN"),  # Uncomment if using a gated model
-            # For 7B models on 16GB GPU, float16 is usually enough, but bfloat16 is better if supported
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            # For more fine-grained control and auto device mapping for multiple GPUs:
-            # model_kwargs={"device_map": "auto", "torch_dtype": torch.float16}
-        )
-        print("Model loaded successfully!")

    except Exception as e:
-        print(f"
-        sys.exit(1)
-    finally:
-        # --- Shutdown Code (Optional): Clean up resources ---
-        print("Application shutting down. Any cleanup can go here.")

-# --- Initialize FastAPI application with the lifespan handler ---
-app = FastAPI(lifespan=lifespan,  # Use the lifespan context manager
-              title="Text Generation API",
-              description="A simple text generation API using Hugging Face transformers",
-              version="1.0.0"
-)

-# Request model
-class TextGenerationRequest(BaseModel):
-    prompt: str
-    max_new_tokens: Optional[int] = 250  # Changed from max_length for better control
-    num_return_sequences: Optional[int] = 1
-    temperature: Optional[float] = 0.7  # Recommend lower temp for more coherent output
-    do_sample: Optional[bool] = True
-    top_p: Optional[float] = 0.9  # Added top_p for more control

-# Response model
-class TextGenerationResponse(BaseModel):
-    generated_text: str
-    prompt: str
-    model_name: str

@app.get("/")
async def root():
    return {
-        "message": "
-        "status": "running",
        "endpoints": {
-        },
-        "current_model": MODEL_NAME
    }

@app.get("/health")
async def health_check():
    return {
-        "status": "healthy"
-        "cuda_available": torch.cuda.is_available(),
-        "model_name": MODEL_NAME
    }

-@app.post("/
-async def
    try:
-            temperature=request.temperature,
-            do_sample=request.do_sample,
-            top_p=request.top_p,  # Pass top_p
-            pad_token_id=generator.tokenizer.eos_token_id,
-            eos_token_id=generator.tokenizer.eos_token_id,
-            # Add stop sequences relevant to your instruction-tuned model format
-            # stop_sequences=["\nUser:", "\n###", "\n\n"]
-        )

-        return
-            model_name=MODEL_NAME
        )
    except Exception as e:
-        print(f"
-        raise HTTPException(status_code=500, detail=f"

-@app.
-async def
-    max_new_tokens: int = 250,  # Changed from max_length
-    temperature: float = 0.7
-):
-    """GET endpoint for simple text generation"""
-    if generator is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet. Service unavailable.")

    try:
        return {
        }
    except Exception as e:

if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=port)

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import uvicorn
+from src.RAGSample import setup_retriever, setup_rag_chain, RAGApplication
import os
+from dotenv import load_dotenv

+# Load environment variables
+load_dotenv()

+# Create FastAPI app
+app = FastAPI(
+    title="RAG API",
+    description="A REST API for Retrieval-Augmented Generation using local vector database",
+    version="1.0.0"
+)

+# Initialize RAG components (this will be done once when the server starts)
+retriever = None
+rag_chain = None
+rag_application = None

+# Pydantic model for request
+class QuestionRequest(BaseModel):
+    question: str

+# Pydantic model for response
+class QuestionResponse(BaseModel):
+    question: str
+    answer: str

+@app.on_event("startup")
+async def startup_event():
+    """Initialize RAG components when the server starts."""
+    global retriever, rag_chain, rag_application
+    try:
+        print("Initializing RAG components...")

+        # Check if Kaggle credentials are provided via environment variables
+        kaggle_username = os.getenv("KAGGLE_USERNAME")
+        kaggle_key = os.getenv("KAGGLE_KEY")
+        kaggle_dataset = os.getenv("KAGGLE_DATASET")

+        # If no environment variables, try to load from kaggle.json
+        if not (kaggle_username and kaggle_key):
+            try:
+                from src.kaggle_loader import KaggleDataLoader
+                # Test if we can create a loader (this will auto-load from kaggle.json)
+                test_loader = KaggleDataLoader()
+                if test_loader.kaggle_username and test_loader.kaggle_key:
+                    kaggle_username = test_loader.kaggle_username
+                    kaggle_key = test_loader.kaggle_key
+                    print(f"Loaded Kaggle credentials from kaggle.json: {kaggle_username}")
+            except Exception as e:
+                print(f"Could not load Kaggle credentials from kaggle.json: {e}")

+        if kaggle_username and kaggle_key and kaggle_dataset:
+            print(f"Loading Kaggle dataset: {kaggle_dataset}")
+            retriever = setup_retriever(
+                use_kaggle_data=True,
+                kaggle_dataset=kaggle_dataset,
+                kaggle_username=kaggle_username,
+                kaggle_key=kaggle_key
+            )
+        else:
+            print("Loading mental health FAQ data from local file...")
+            # Load mental health FAQ data from local file (default behavior)
+            retriever = setup_retriever()

+        rag_chain = setup_rag_chain()
+        rag_application = RAGApplication(retriever, rag_chain)
+        print("RAG components initialized successfully!")
    except Exception as e:
+        print(f"Error initializing RAG components: {e}")
+        raise

@app.get("/")
async def root():
+    """Root endpoint with API information."""
    return {
+        "message": "RAG API is running",
        "endpoints": {
+            "ask_question": "/ask",
+            "health_check": "/health",
+            "load_kaggle_dataset": "/load-kaggle-dataset"
+        }
    }

@app.get("/health")
async def health_check():
+    """Health check endpoint."""
    return {
+        "status": "healthy",
+        "rag_initialized": rag_application is not None
    }

+@app.post("/ask", response_model=QuestionResponse)
+async def ask_question(request: QuestionRequest):
+    """Ask a question and get an answer using RAG."""
+    if rag_application is None:
+        raise HTTPException(status_code=500, detail="RAG application not initialized")

    try:
+        print(f"Processing question: {request.question}")

+        # Debug: Check what retriever we're using
+        retriever_type = type(rag_application.retriever).__name__
+        print(f"DEBUG: Using retriever type: {retriever_type}")

+        answer = rag_application.run(request.question)

+        return QuestionResponse(
+            question=request.question,
+            answer=answer
        )
    except Exception as e:
+        print(f"Error processing question: {e}")
+        raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")

+@app.post("/load-kaggle-dataset")
+async def load_kaggle_dataset(dataset_name: str):
+    """Load a Kaggle dataset for RAG."""
    try:
+        from src.kaggle_loader import KaggleDataLoader

+        # Create loader without parameters - it will auto-load from kaggle.json
+        loader = KaggleDataLoader()

+        # Download the dataset
+        dataset_path = loader.download_dataset(dataset_name)

+        # Reload the retriever with the new dataset
+        global rag_application
+        retriever = setup_retriever(use_kaggle_data=True, kaggle_dataset=dataset_name)
+        rag_chain = setup_rag_chain()
+        rag_application = RAGApplication(retriever, rag_chain)

        return {
+            "status": "success",
+            "message": f"Dataset {dataset_name} loaded successfully",
+            "dataset_path": dataset_path
        }
    except Exception as e:
+        return {"status": "error", "message": str(e)}

+@app.get("/models")
+async def get_models():
+    """Get information about available models."""
+    return {
+        "llm_model": "dolphin-llama3:8b",
+        "embedding_model": "TF-IDF embeddings",
+        "vector_database": "ChromaDB (local)"
+    }

if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
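A minimal usage sketch for the endpoints added in this change, assuming the app is run locally on port 8000 as in the __main__ block above; the requests client and the example question are illustrative assumptions, not part of the Space.

import requests

BASE_URL = "http://localhost:8000"  # assumed local run of this app.py

# Confirm the startup hook finished initializing the RAG components.
health = requests.get(f"{BASE_URL}/health").json()
print(health)  # expect {'status': 'healthy', 'rag_initialized': True} once ready

# Ask a question through the /ask endpoint defined above.
resp = requests.post(f"{BASE_URL}/ask", json={"question": "How can I manage anxiety?"})
resp.raise_for_status()
print(resp.json()["answer"])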