Daniel.y committed
Commit 132bd6e · unverified · 2 Parent(s): b05bfe9 6f663b5

Merge pull request #1673 from danielaskdd/litellm-problem

feat: Support `application/octet-stream` requests from LiteLLM clients for Ollama Emulation
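
For context, a minimal client-side sketch of the request shape this change is meant to accept: LiteLLM-style clients post the JSON body with `Content-Type: application/octet-stream` rather than `application/json`, which the previous endpoint signatures (Pydantic body parameters) could not parse. The URL, port, and payload below are illustrative assumptions, not taken from this PR:

```python
# Hypothetical client: posts a JSON payload as raw bytes with an
# application/octet-stream Content-Type, the case this PR adds support for.
# The URL, port, and payload values are assumptions for illustration only.
import json
import requests

payload = {"model": "lightrag:latest", "prompt": "Hello", "stream": False}
resp = requests.post(
    "http://localhost:9621/api/generate",      # assumed LightRAG Ollama-emulation endpoint
    data=json.dumps(payload).encode("utf-8"),  # raw bytes instead of json=...
    headers={"Content-Type": "application/octet-stream"},
)
print(resp.status_code, resp.json())
```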

lightrag/api/__init__.py CHANGED
@@ -1 +1 @@
- __api_version__ = "0172"
+ __api_version__ = "0173"
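
The ollama_api.py diff below centralizes that handling in a new parse_request_body helper. As a rough standalone illustration of the same Content-Type dispatch (not part of this PR; the route, model, and app here are made up, and a recent FastAPI/Starlette whose TestClient is httpx-based, i.e. accepts `content=`, is assumed):

```python
# Standalone sketch (not from the PR): a toy FastAPI route that applies the
# same Content-Type dispatch as the parse_request_body helper added below,
# exercised with both Content-Types.
import json

from fastapi import FastAPI, HTTPException, Request
from fastapi.testclient import TestClient
from pydantic import BaseModel


class ToyGenerateRequest(BaseModel):  # hypothetical stand-in for OllamaGenerateRequest
    model: str
    prompt: str


app = FastAPI()


@app.post("/api/generate")
async def generate(raw_request: Request):
    content_type = raw_request.headers.get("content-type", "").lower()
    try:
        if content_type.startswith("application/json"):
            body = await raw_request.json()
        else:
            # application/octet-stream (or anything else): parse the raw bytes as JSON
            body = json.loads((await raw_request.body()).decode("utf-8"))
        req = ToyGenerateRequest(**body)
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid JSON in request body")
    return {"echo": req.prompt}


client = TestClient(app)
payload = json.dumps({"model": "lightrag:latest", "prompt": "hi"}).encode("utf-8")

for content_type in ("application/json", "application/octet-stream"):
    r = client.post(
        "/api/generate",
        content=payload,
        headers={"Content-Type": content_type},
    )
    print(content_type, "->", r.json())  # both print {'echo': 'hi'}
```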
lightrag/api/routers/ollama_api.py CHANGED
@@ -1,7 +1,7 @@
  from fastapi import APIRouter, HTTPException, Request
  from pydantic import BaseModel
- from typing import List, Dict, Any, Optional
- import logging
+ from typing import List, Dict, Any, Optional, Type
+ from lightrag.utils import logger
  import time
  import json
  import re
@@ -95,6 +95,68 @@ class OllamaTagResponse(BaseModel):
      models: List[OllamaModel]
 
 
+ class OllamaRunningModelDetails(BaseModel):
+     parent_model: str
+     format: str
+     family: str
+     families: List[str]
+     parameter_size: str
+     quantization_level: str
+
+
+ class OllamaRunningModel(BaseModel):
+     name: str
+     model: str
+     size: int
+     digest: str
+     details: OllamaRunningModelDetails
+     expires_at: str
+     size_vram: int
+
+
+ class OllamaPsResponse(BaseModel):
+     models: List[OllamaRunningModel]
+
+
+ async def parse_request_body(
+     request: Request, model_class: Type[BaseModel]
+ ) -> BaseModel:
+     """
+     Parse request body based on Content-Type header.
+     Supports both application/json and application/octet-stream.
+
+     Args:
+         request: The FastAPI Request object
+         model_class: The Pydantic model class to parse the request into
+
+     Returns:
+         An instance of the provided model_class
+     """
+     content_type = request.headers.get("content-type", "").lower()
+
+     try:
+         if content_type.startswith("application/json"):
+             # FastAPI already handles JSON parsing for us
+             body = await request.json()
+         elif content_type.startswith("application/octet-stream"):
+             # Manually parse octet-stream as JSON
+             body_bytes = await request.body()
+             body = json.loads(body_bytes.decode("utf-8"))
+         else:
+             # Try to parse as JSON for any other content type
+             body_bytes = await request.body()
+             body = json.loads(body_bytes.decode("utf-8"))
+
+         # Create an instance of the model
+         return model_class(**body)
+     except json.JSONDecodeError:
+         raise HTTPException(status_code=400, detail="Invalid JSON in request body")
+     except Exception as e:
+         raise HTTPException(
+             status_code=400, detail=f"Error parsing request body: {str(e)}"
+         )
+
+
  def estimate_tokens(text: str) -> int:
      """Estimate the number of tokens in text using tiktoken"""
      tokens = TiktokenTokenizer().encode(text)
@@ -197,13 +259,43 @@ class OllamaAPI:
                  ]
              )
 
-         @self.router.post("/generate", dependencies=[Depends(combined_auth)])
-         async def generate(raw_request: Request, request: OllamaGenerateRequest):
+         @self.router.get("/ps", dependencies=[Depends(combined_auth)])
+         async def get_running_models():
+             """List Running Models - returns currently running models"""
+             return OllamaPsResponse(
+                 models=[
+                     {
+                         "name": self.ollama_server_infos.LIGHTRAG_MODEL,
+                         "model": self.ollama_server_infos.LIGHTRAG_MODEL,
+                         "size": self.ollama_server_infos.LIGHTRAG_SIZE,
+                         "digest": self.ollama_server_infos.LIGHTRAG_DIGEST,
+                         "details": {
+                             "parent_model": "",
+                             "format": "gguf",
+                             "family": "llama",
+                             "families": ["llama"],
+                             "parameter_size": "7.2B",
+                             "quantization_level": "Q4_0",
+                         },
+                         "expires_at": "2050-12-31T14:38:31.83753-07:00",
+                         "size_vram": self.ollama_server_infos.LIGHTRAG_SIZE,
+                     }
+                 ]
+             )
+
+         @self.router.post(
+             "/generate", dependencies=[Depends(combined_auth)], include_in_schema=True
+         )
+         async def generate(raw_request: Request):
              """Handle generate completion requests acting as an Ollama model
              For compatibility purpose, the request is not processed by LightRAG,
              and will be handled by underlying LLM model.
+             Supports both application/json and application/octet-stream Content-Types.
              """
              try:
+                 # Parse the request body manually
+                 request = await parse_request_body(raw_request, OllamaGenerateRequest)
+
                  query = request.prompt
                  start_time = time.time_ns()
                  prompt_tokens = estimate_tokens(query)
@@ -278,7 +370,7 @@ class OllamaAPI:
                          else:
                              error_msg = f"Provider error: {error_msg}"
 
-                         logging.error(f"Stream error: {error_msg}")
+                         logger.error(f"Stream error: {error_msg}")
 
                          # Send error message to client
                          error_data = {
@@ -363,13 +455,19 @@ class OllamaAPI:
                  trace_exception(e)
                  raise HTTPException(status_code=500, detail=str(e))
 
-         @self.router.post("/chat", dependencies=[Depends(combined_auth)])
-         async def chat(raw_request: Request, request: OllamaChatRequest):
+         @self.router.post(
+             "/chat", dependencies=[Depends(combined_auth)], include_in_schema=True
+         )
+         async def chat(raw_request: Request):
              """Process chat completion requests acting as an Ollama model
              Routes user queries through LightRAG by selecting query mode based on prefix indicators.
              Detects and forwards OpenWebUI session-related requests (for meta data generation task) directly to LLM.
+             Supports both application/json and application/octet-stream Content-Types.
              """
              try:
+                 # Parse the request body manually
+                 request = await parse_request_body(raw_request, OllamaChatRequest)
+
                  # Get all messages
                  messages = request.messages
                  if not messages:
@@ -496,7 +594,7 @@ class OllamaAPI:
                          else:
                              error_msg = f"Provider error: {error_msg}"
 
-                         logging.error(f"Stream error: {error_msg}")
+                         logger.error(f"Stream error: {error_msg}")
 
                          # Send error message to client
                          error_data = {
@@ -530,6 +628,11 @@ class OllamaAPI:
                  data = {
                      "model": self.ollama_server_infos.LIGHTRAG_MODEL,
                      "created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
+                     "message": {
+                         "role": "assistant",
+                         "content": "",
+                         "images": None,
+                     },
                      "done": True,
                      "total_duration": total_time,
                      "load_duration": 0,