File size: 7,432 Bytes
d971a8d
 
 
 
 
 
0a3b5e1
 
d971a8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b7d7e0
0a3b5e1
d971a8d
 
 
 
0a3b5e1
 
 
 
 
 
 
 
d971a8d
 
 
 
0a3b5e1
 
 
 
d971a8d
 
0a3b5e1
d971a8d
0a3b5e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d971a8d
 
0a3b5e1
 
 
 
 
 
 
d971a8d
0a3b5e1
 
 
 
 
d971a8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a3b5e1
 
 
 
 
 
 
d971a8d
 
 
 
 
8b7d7e0
d971a8d
 
 
 
 
 
 
0a3b5e1
 
 
 
d971a8d
 
 
 
92343a7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from fastapi import APIRouter, File, UploadFile, HTTPException, Depends
from bson.objectid import ObjectId
import os
import PyPDF2
from io import BytesIO
from datetime import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from auth import get_admin_user
from database import get_db
from config import SAVE_FOLDER
from chat import embedding_model

# Router collecting the administration endpoints; every route declared on it
# is served under the /api/admin prefix.
router = APIRouter(
    tags=["Administration"],
    prefix="/api/admin",
)

# Database handle obtained once at import time and shared by all handlers in
# this module (they read and write its `connaissances` collection).
db = get_db()

@router.post("/knowledge/upload")
async def upload_pdf(
    file: UploadFile = File(...),
    title: str = None,
    tags: str = None,
    current_user: dict = Depends(get_admin_user)
):
    """Ingest an uploaded PDF into the `connaissances` collection.

    The PDF text is extracted page by page, split into overlapping chunks,
    and stored as one parent document plus one document per chunk. Each chunk
    carries an embedding when `embedding_model` is available and the
    embedding call succeeds; otherwise its `embedding` field is None.

    Args:
        file: The uploaded file; its name must end with ".pdf"
            (case-insensitive).
        title: Optional display title; defaults to the uploaded file name.
        tags: Optional comma-separated tag string, split on ",".
        current_user: Admin user resolved by `get_admin_user`; its "_id" is
            recorded as the uploader.

    Returns:
        A dict with "success", and either the new "document_id" together with
        "chunks_total" / "chunks_inserted", or an "error" message when the
        parent document cannot be read back after insertion.

    Raises:
        HTTPException: 400 when the file is not a PDF, 500 on any other
            failure during processing.
    """
    try:
        # The filename can be absent on some multipart uploads; treat that as
        # "not a PDF" instead of crashing on None.
        filename = file.filename or ""
        if not filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="Le fichier doit être un PDF")

        contents = await file.read()
        pdf_reader = PyPDF2.PdfReader(BytesIO(contents))

        # extract_text() may yield nothing for pages without a text layer, so
        # fall back to an empty string. Each page is newline-terminated.
        text_content = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

        doc_id = ObjectId()
        doc_title = title or filename
        tag_list = tags.split(",") if tags else []
        uploader_id = str(current_user["_id"])

        pdf_path = f"/tmp/{str(doc_id)}.pdf"
        # NOTE(review): the "files" directory is created here although the
        # copy is written under /tmp — kept as-is in case other code relies
        # on the directory existing; confirm the intended storage location.
        os.makedirs("files", exist_ok=True)
        with open(pdf_path, "wb") as f:
            f.write(contents)

        print(f"Découpage du document '{doc_title}' en chunks...")
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

        doc = Document(page_content=text_content, metadata={"title": doc_title})
        chunks = splitter.split_documents([doc])
        print(f"{len(chunks)} morceaux extraits.")

        main_document = {
            "_id": doc_id,
            "title": doc_title,
            "tags": list(tag_list),
            "uploaded_by": uploader_id,
            "upload_date": datetime.utcnow(),
            "is_parent": True,
            "chunk_count": len(chunks),
            "file_path": pdf_path
        }

        db.connaissances.insert_one(main_document)

        inserted_chunks = 0
        chunk_ids = []

        for i, chunk in enumerate(chunks):
            # Best effort per chunk: a failure is logged and the remaining
            # chunks are still processed.
            try:
                # Hard cap on the stored/embedded text length.
                chunk_text = chunk.page_content[:5000]

                embedding = None
                if embedding_model:
                    try:
                        embedding = embedding_model.embed_query(chunk_text)
                    except Exception as e:
                        # The chunk is still stored, just without an embedding.
                        print(f"Erreur lors de la génération de l'embedding pour le morceau {i+1}: {str(e)}")

                chunk_id = ObjectId()
                chunk_doc = {
                    "_id": chunk_id,
                    "parent_id": doc_id,
                    "text": chunk_text,
                    "embedding": embedding,
                    "title": f"{doc_title} - Partie {i+1}",
                    "tags": list(tag_list),
                    "chunk_index": i,
                    "uploaded_by": uploader_id,
                    "upload_date": datetime.utcnow(),
                    "is_chunk": True
                }

                db.connaissances.insert_one(chunk_doc)
                chunk_ids.append(str(chunk_id))
                inserted_chunks += 1

                print(f"Morceau {i+1}/{len(chunks)} inséré.")
            except Exception as chunk_error:
                print(f"Erreur lors du traitement du morceau {i+1}: {str(chunk_error)}")

        # Record on the parent which chunks were actually stored.
        db.connaissances.update_one(
            {"_id": doc_id},
            {"$set": {"chunk_ids": chunk_ids, "inserted_chunks": inserted_chunks}}
        )

        # Read the parent back to confirm the insertion really happened.
        verification = db.connaissances.find_one({"_id": doc_id})
        if not verification:
            print("ERREUR: Document parent non trouvé après insertion")
            return {
                "success": False,
                "error": "Document parent non trouvé après insertion"
            }

        print(f"Document parent vérifié et trouvé dans la base de données avec {inserted_chunks} chunks")
        return {
            "success": True,
            "document_id": str(doc_id),
            "chunks_total": len(chunks),
            "chunks_inserted": inserted_chunks
        }

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # unchanged rather than being rewrapped as a 500.
        raise
    except Exception as e:
        import traceback
        print(f"Erreur lors de l'upload du PDF: {traceback.format_exc()}")
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e

@router.get("/knowledge")
async def list_documents(current_user: dict = Depends(get_admin_user)):
    """List every document of the `connaissances` collection, newest first.

    Args:
        current_user: Admin user resolved by `get_admin_user` (access guard;
            not otherwise used here).

    Returns:
        ``{"documents": [...]}`` where each entry holds the document "id",
        "title", "tags", ISO-formatted "date" (None when no upload date is
        stored) and a "text_preview" truncated to 100 characters followed by
        "..." when the stored text is longer.

    Raises:
        HTTPException: 500 when the query or the serialization fails.
    """
    try:
        # Embedding vectors are never returned by this endpoint, so exclude
        # them from the query instead of loading them for every chunk.
        cursor = db.connaissances.find({}, {"embedding": 0}).sort("upload_date", -1)

        result = []
        for doc in cursor:
            # Tolerate fields that are missing or explicitly stored as None.
            text = doc.get("text") or ""
            uploaded = doc.get("upload_date")
            result.append({
                "id": str(doc["_id"]),
                "title": doc.get("title", "Sans titre"),
                "tags": doc.get("tags", []),
                "date": uploaded.isoformat() if uploaded is not None else None,
                "text_preview": text[:100] + "..." if len(text) > 100 else text,
            })

        return {"documents": result}
    except Exception as e:
        print(f"Erreur lors de la liste des documents: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Erreur: {str(e)}") from e

@router.delete("/knowledge/{document_id}")
async def delete_document(document_id: str, current_user: dict = Depends(get_admin_user)):
    """Delete a document, its chunks (for a parent) and its stored PDF copy.

    Args:
        document_id: String form of the document's ObjectId.
        current_user: Admin user resolved by `get_admin_user` (access guard;
            not otherwise used here).

    Returns:
        A dict with "success" set to True and a "message" stating how many
        associated chunks were removed.

    Raises:
        HTTPException: 400 for a malformed id, 404 when no such document
            exists, 500 when the deletion fails or an unexpected error occurs.
    """
    try:
        try:
            doc_id = ObjectId(document_id)
        except Exception:
            raise HTTPException(status_code=400, detail="ID de document invalide") from None

        document = db.connaissances.find_one({"_id": doc_id})
        if not document:
            raise HTTPException(status_code=404, detail="Document non trouvé")

        chunks_deleted = 0
        if document.get("is_parent", False):
            # Remove every chunk attached to this parent.
            chunks_result = db.connaissances.delete_many({"parent_id": doc_id})
            chunks_deleted = chunks_result.deleted_count
            print(f"Suppression de {chunks_deleted} chunks associés au document {document_id}")

        result = db.connaissances.delete_one({"_id": doc_id})

        if result.deleted_count == 0:
            raise HTTPException(status_code=500, detail="Échec de la suppression du document")

        # Prefer the path recorded at upload time; otherwise rebuild it from
        # the normalized ObjectId rather than the raw request string, whose
        # letter case may differ from the name the file was saved under.
        pdf_path = document.get("file_path") or f"/tmp/{str(doc_id)}.pdf"
        try:
            os.remove(pdf_path)
            print(f"Fichier supprimé: {pdf_path}")
        except FileNotFoundError:
            # No stored copy (e.g. the target was a chunk): nothing to clean up.
            pass
        except Exception as e:
            # File cleanup is best effort; the database deletion already succeeded.
            print(f"Erreur lors de la suppression du fichier: {str(e)}")

        return {
            "success": True,
            "message": f"Document supprimé avec succès, ainsi que {chunks_deleted} chunks associés"
        }

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la suppression: {str(e)}") from e