luanpoppe committed
Commit e70ffc1 · 1 Parent(s): 8f3dc39

feat: add the option to make requests with Gemini, add the option to make requests with DeepSeek, make not using LlamaParse the default, and change the default so the contextual step makes fewer requests by handling responses that contain several chunks at once
Browse files
- .env.example +4 -1
- _utils/LLMs/LLM_class.py +26 -0
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +190 -84
- _utils/gerar_relatorio_modelo_usuario/prompts.py +65 -28
- _utils/handle_files.py +7 -3
- _utils/resumo_completo_cursor.py +1 -1
- _utils/splitters/Splitter_class.py +6 -2
- gerar_documento/serializer.py +1 -1
- setup/easy_imports.py +1 -0
.env.example
CHANGED
@@ -6,4 +6,7 @@ LANGCHAIN_API_KEY=""
 CLAUDE_API_KEY=""
 COHERE_API_KEY=""
 BUBBLE_TOKEN=""
-
+LLAMA_CLOUD_API_KEY_POPS=""
+LLAMA_CLOUD_API_KEY_PEIXE=""
+DEEPSEEKK_API_KEY=""
+GOOGLE_API_KEY_PEIXE=""
_utils/LLMs/LLM_class.py
CHANGED
@@ -1,4 +1,13 @@
+# from langchain_openai import OpenAI
+from typing import cast
+from openai import OpenAI
+from pydantic import SecretStr
 from setup.environment import default_model
+from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
+import os
+
+deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
+google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
 
 
 class LLM:
@@ -7,3 +16,20 @@ class LLM:
 
     # def create_GPT_model(self, model=default_model):
     #     return ChatOpen()
+
+    def deepseek(self):
+        return ChatOpenAI(
+            api_key=SecretStr(deepseek_api_key),
+            base_url="https://api.deepseek.com/v1",
+            model="deepseek-chat",
+        )
+
+    def googleGemini(self):
+        return ChatGoogleGenerativeAI(
+            api_key=SecretStr(google_api_key),
+            model="gemini-1.5-flash",
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        )
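The new class is a thin factory over LangChain chat models: deepseek() goes through DeepSeek's OpenAI-compatible endpoint, googleGemini() through langchain-google-genai. A minimal usage sketch, not part of the commit, assuming the keys from .env.example are exported and langchain-google-genai is installed:

```python
# Usage sketch of the new LLM factory (assumes DEEPSEEKK_API_KEY and
# GOOGLE_API_KEY_PEIXE are set in the environment).
import asyncio
from langchain_core.messages import HumanMessage
from _utils.LLMs.LLM_class import LLM

async def main() -> None:
    llms = LLM()
    # DeepSeek via the OpenAI-compatible base_url
    deepseek_reply = await llms.deepseek().ainvoke(
        [HumanMessage(content="Responda em uma linha: o que é um chunk?")]
    )
    # Gemini 1.5 Flash via langchain-google-genai
    gemini_reply = await llms.googleGemini().ainvoke(
        [HumanMessage(content="Responda em uma linha: o que é um chunk?")]
    )
    print(deepseek_reply.content)
    print(gemini_reply.content)

asyncio.run(main())
```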
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 
+from _utils.LLMs.LLM_class import LLM
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
     prompt_auxiliar_do_contextual_prompt,
     create_prompt_auxiliar_do_contextual_prompt,
@@ -11,7 +12,7 @@ from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
 from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -48,101 +49,211 @@ class ContextualRetriever:
         self.bm25 = None
         self.claude_context_model = claude_context_model
 
-    async def llm_generate_context(
-        self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    ) -> str:
-        """Generate contextual description using ChatOpenAI"""
-        try:
-            print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-            # response = await aclaude_answer(
-            #     self.claude_client, self.claude_context_model, prompt
-            # )
-
-            response = await agpt_answer(prompt)
-            return response
-        except Exception as e:
-            self.logger.error(
-                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-            )
-            return ""
-
-    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
-    #     prompt = Prompt().create_prompt_template(
-    #         "", prompt_auxiliar_do_contextual_prompt
-    #     )
-    #     Chain(prompt, ChatOpenAI())
-    #     return
-
-    async def create_contextualized_chunk(
-        self, chunk, single_page_text, response_auxiliar_summary
-    ):
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # Código comentado abaixo é para ler as páginas ao redor da página atual do chunk
-        # page_content = ""
-        # for i in range(
-        #     max(0, chunk.page_number - 1),
-        #     min(len(single_page_text), chunk.page_number + 2),
-        # ):
-        #     page_content += single_page_text[i].page_content if single_page_text[i] else ""
-        page_number = chunk.page_number - 1
-        page_content = single_page_text[page_number].page_content
-
-        context = await self.llm_generate_context(
-            page_content, chunk, response_auxiliar_summary
-        )
-        return ContextualizedChunk(
-            content=chunk.content,
-            page_number=chunk.page_number,
-            chunk_id=chunk.chunk_id,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
-            context=context,
-        )
-
     async def contextualize_all_chunks(
         self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
-        lista_contador = []
         full_text = ""
         for x in full_text_as_array:
             full_text += x.page_content
 
-        print("\n\n\
-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary)
+        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+        # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
+        # response_auxiliar_summary = await aclaude_answer(
+        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+        # )
+
+        llms = LLM()
+        response_auxiliar_summary = await llms.googleGemini().ainvoke(
+            [HumanMessage(content=prompt_auxiliar_summary)]
+        )
+
+        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
+
+        lista_de_listas_cada_com_20_chunks = [
+            chunks[i : i + 20] for i in range(0, len(chunks), 20)
+        ]
+        print(
+            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
+        )
 
         async with asyncio.TaskGroup() as tg:
             tasks = [
                 tg.create_task(
                     self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary
+                        chunk, full_text_as_array, response_auxiliar_summary.content
                     )
                 )
-                for chunk in chunks
+                # for chunk in chunks # ORIGINAL
+                for chunk in lista_de_listas_cada_com_20_chunks
             ]
 
-        contextualized_chunks = [task.result() for task in tasks]
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            # print("\n\ntask", task)
+            # print("\n\ntask.result()", task.result())
+
+            contextualized_chunks = contextualized_chunks + task.result()
 
+        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks
 
+    # ORIGINAL
+    # async def create_contextualized_chunk(
+    #     self, chunk, single_page_text, response_auxiliar_summary
+    # ):
+    #     lista_contador.append(0)
+    #     print("contador: ", len(lista_contador))
+    #     page_number = chunk.page_number - 1
+    #     page_content = single_page_text[page_number].page_content
+
+    #     context = await self.llm_generate_context(
+    #         page_content, chunk, response_auxiliar_summary
+    #     )
+    #     print("context: ", context)
+    #     return ContextualizedChunk(
+    #         content=chunk.content,
+    #         page_number=chunk.page_number,
+    #         chunk_id=chunk.chunk_id,
+    #         start_char=chunk.start_char,
+    #         end_char=chunk.end_char,
+    #         context=context,
+    #     )
+
+    async def create_contextualized_chunk(
+        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
+    ):
+
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
+        all_pages_contents = ""
+        contador = 1
+        for chunk in chunks:
+            page_number = chunk.page_number - 1
+            page_content = single_page_text[page_number].page_content
+
+            all_pages_contents += page_content
+            contador += 1
+
+        context = await self.llm_generate_context(
+            page_content, chunks, response_auxiliar_summary
+        )
+
+        context = (
+            context.replace("document_id: ", "")
+            .replace("document_id:", "")
+            .replace("DOCUMENT_ID: ", "")
+            .replace("DOCUMENT_ID: ", "")
+        )
+
+        # print("context: ", context)
+        import re
+
+        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Funciona para quando a resposta do LLM não vem com "document_id" escrito
+        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
+        matches = re.findall(pattern, context, re.DOTALL)
+
+        # Convert matches to the desired format
+        result = [
+            [int(doc_id), title.strip(), content.strip()]
+            for doc_id, title, content in matches
+        ]
+        # print("\n\nresult", result)
+
+        lista_chunks = []
+        for index, chunk in enumerate(chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    chunk_id=result[index][0],
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=" ".join(result[index][1:2]),
+                )
+            )
+
+        return lista_chunks
+
+    # ORIGINAL
+    # async def llm_generate_context(
+    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
+    # ) -> str:
+    #     """Generate contextual description using ChatOpenAI"""
+    #     try:
+    #         print("COMEÇOU A REQUISIÇÃO")
+    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
+    #         # response = await aclaude_answer(
+    #         #     self.claude_client, self.claude_context_model, prompt
+    #         # )
+
+    #         # response = await agpt_answer(prompt)
+    #         llms = LLM()
+    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+    #         return cast(str, response.content)
+    #     except Exception as e:
+    #         self.logger.error(
+    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+    #         )
+    #         return ""
+
+    async def llm_generate_context(
+        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+    ) -> str:
+        """Generate contextual description using ChatOpenAI"""
+        contador = 1
+        all_chunks_contents = ""
+
+        for chunk in chunks:
+            all_chunks_contents += chunk.content
+            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
+            contador += 1
+
+        try:
+            print("COMEÇOU A REQUISIÇÃO")
+            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            # response = await aclaude_answer(
+            #     self.claude_client, self.claude_context_model, prompt
+            # )
+
+            response = await agpt_answer(prompt)
+            # llms = LLM()
+            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+            # return cast(str, response.content)
+            return cast(str, response)
+        except Exception as e:
+            self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
+            return ""
+
+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
+
+# Primeira função chamada do arquivo
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
+
 
 async def get_full_text_and_all_PDFs_chunks(
     listaPDFs: List[str],
@@ -159,7 +270,9 @@ async def get_full_text_and_all_PDFs_chunks(
             pages = pages + await return_document_list_with_llama_parser(pdf_path)
         else:
            pages = pages + get_pdf_from_bubble(pdf_path)
-        chunks = splitterObject.load_and_split_document(pdf_path, pages)
+        chunks = splitterObject.load_and_split_document(
+            pdf_path, pages, should_use_llama_parse
+        )
         all_PDFs_chunks = all_PDFs_chunks + chunks
         # Get full text for contextualization
         # loader = PyPDFLoader(pdf_path)
@@ -170,17 +283,10 @@ async def get_full_text_and_all_PDFs_chunks(
     return all_PDFs_chunks, pages  # , full_text
 
 
-async def contextualize_chunk_based_on_serializer(
-    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
-):
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-            pages, all_PDFs_chunks
-        )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
+# Código comentado abaixo é para ler as páginas ao redor da página atual do chunk
+# page_content = ""
+# for i in range(
+#     max(0, chunk.page_number - 1),
+#     min(len(single_page_text), chunk.page_number + 2),
+# ):
+#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
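The core behavioral change is the fan-out: instead of one LLM request per chunk, create_contextualized_chunk now receives a batch of 20 chunks and returns a list of contextualized chunks, cutting the number of contextual requests by roughly a factor of 20. A standalone sketch of the pattern with simplified names and a stubbed LLM call, not the commit's exact code:

```python
# Sketch of the batch-of-20 fan-out: group chunks, one request per group
# inside an asyncio.TaskGroup, then flatten the per-batch result lists.
import asyncio
from typing import List

async def contextualize_batch(batch: List[str]) -> List[str]:
    # Stand-in for create_contextualized_chunk: one LLM call covers the batch.
    await asyncio.sleep(0)  # placeholder for the real request
    return [f"context for: {chunk}" for chunk in batch]

async def contextualize_all(chunks: List[str]) -> List[str]:
    batches = [chunks[i : i + 20] for i in range(0, len(chunks), 20)]
    async with asyncio.TaskGroup() as tg:  # requires Python 3.11+
        tasks = [tg.create_task(contextualize_batch(b)) for b in batches]
    contextualized: List[str] = []
    for task in tasks:
        contextualized += task.result()  # flatten List[List[str]]
    return contextualized

print(asyncio.run(contextualize_all([f"chunk {n}" for n in range(45)])))
```

Two observations on the new parsing step: result[index] assumes the model returned exactly one chunk_context entry per chunk, so a short response raises IndexError; and " ".join(result[index][1:2]) keeps only the document title, whereas result[index][1:3] would also include the generated context. Whether the latter is intentional is not stated in the commit.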
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -117,40 +117,77 @@ Formate sua resposta da seguinte maneira:
 </resumo_final>"""
 
 
-def contextual_prompt(single_page_text, summary_text, chunk_content):
-    return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
-
-Here's the summary of the full text of the document:
-<summary_text>
-{summary_text}
-</summary_text>
-
-Here's the single page where the chunk is situated:
-
-<single_page>
-{single_page_text}
-</single_page>
-
-And here's the specific chunk to contextualize:
-<chunk>
-{chunk_content}
-</chunk>
-
-Follow these steps:
-1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
-2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
-3. Identify where the specific chunk fits within these themes.
-4. Create a concise context that situates the chunk within the document.
-
-With this informations, your response should be a single, concise paragraph that includes:
-- The document ID
-- The document name
-- A brief context for the chunk
-
-Example final output structure (do not copy the content, only the format):
-<chunk_context>
-[Single paragraph with document ID, name, and chunk context]
-</chunk_context>"""
+# ORIGINAL
+# def contextual_prompt(single_page_text, summary_text, chunk_content):
+#     return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
+
+# Here's the summary of the full text of the document:
+# <summary_text>
+# {summary_text}
+# </summary_text>
+
+# Here's the single page where the chunk is situated:
+
+# <single_page>
+# {single_page_text}
+# </single_page>
+
+# And here's the specific chunk to contextualize:
+# <chunk>
+# {chunk_content}
+# </chunk>
+
+# Follow these steps:
+# 1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+# 2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+# 3. Identify where the specific chunk fits within these themes.
+# 4. Create a concise context that situates the chunk within the document.
+
+# With this informations, your response should be a single, concise paragraph that includes:
+# - The document ID
+# - The document name
+# - A brief context for the chunk
+
+# Example final output structure (do not copy the content, only the format):
+# <chunk_context>
+# [Single paragraph with document ID, name, and chunk context]
+# </chunk_context>"""
+
+
+def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+    return f"""
+You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
+Here's the summary of the full text of the document:
+<summary_text>
+{summary_text}
+</summary_text>
+Here are the pages where the chunks are situated:
+<page>
+{all_pages_contents}
+</page>
+You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
+1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
+3. Identify where the specific chunk fits within these themes.
+4. Create a concise context that situates the chunk within the document.
+Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
+<final_output>
+[document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</final_output>
+Here are the 20 chunks to analyze:
+<user_input>
+{chunk_content}
+</user_input>
+Example output structure (do not copy the content, only the format):
+1. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+2.<chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+[Continue for all 20 chunks]
+Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+"""
 
 
 # return f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
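A standalone sketch, with made-up sample model output, of how the numbered chunk_context format this prompt requests is parsed back by the regex in contextual_retriever.py:

```python
# Sketch (not part of the commit): parsing the numbered <chunk_context> list
# with the same pattern the retriever uses. The sample output is invented.
import re

llm_output = """
1. <chunk_context>
[123] --- [Petição Inicial] --- [This chunk covers the opening claims.]
</chunk_context>
2.<chunk_context>
[456] --- [Contestação] --- [This chunk covers the defendant's reply.]
</chunk_context>
"""

# Pattern used when the model omits the "document_id" label.
pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"
matches = re.findall(pattern, llm_output, re.DOTALL)
result = [
    [int(doc_id), title.strip(), content.strip()]
    for doc_id, title, content in matches
]
print(result)  # [[123, '[Petição Inicial]', ...], [456, '[Contestação]', ...]]
```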
_utils/handle_files.py
CHANGED
@@ -19,7 +19,7 @@ def handle_pdf_files_from_serializer(files):
             temp_file.write(chunk)
         temp_file_path = temp_file.name  # Get the path of the temporary file
         listaPDFs.append(temp_file_path)
-    print("
+    print("\n\nlistaPDFs: ", listaPDFs)
     return listaPDFs
 
 
@@ -29,7 +29,7 @@ def remove_pdf_temp_files(listaPDFs):
 
 
 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("
+    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
     documents: List[LangchainDocument] = []
     if llama_parser_api:
         parser = LlamaParse(
@@ -39,7 +39,11 @@ async def return_document_list_with_llama_parser(file: str):
             verbose=True,
         )
 
-        parsed_document = await parser.aget_json(file)
+        try:
+            parsed_document = await parser.aget_json(file)
+        except:
+            raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
+        print("parsed_document: ", parsed_document)
         for doc in parsed_document[0].get("pages"):  # type: ignore
             # documents.append(doc.to_langchain_format())
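One caveat in the new error handling: the bare except: re-raises a ValueError but discards the original LlamaParse error and traceback. A sketch of an alternative that preserves it, using a stand-in parser object rather than the real LlamaParse class:

```python
# Sketch (not part of the commit): chaining with "from e" keeps the original
# cause visible instead of swallowing it behind a generic ValueError.
import asyncio

class FakeParser:  # stand-in for LlamaParse
    async def aget_json(self, file: str):
        raise RuntimeError("upstream parser error")

async def parse_with_context(file: str):
    parser = FakeParser()
    try:
        return await parser.aget_json(file)
    except Exception as e:
        # "from e" chains the original exception as the cause
        raise ValueError("ALGO DEU ERRADO NO PARSER DO LLAMA PARSE") from e

# asyncio.run(parse_with_context("doc.pdf"))  # ValueError with RuntimeError cause
```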
_utils/resumo_completo_cursor.py
CHANGED
@@ -105,7 +105,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"
 
-    print("\n\ntexto_completo: ", texto_completo)
+    print("\n\ntexto_completo[0: 1000]: ", texto_completo[0:1000])
 
     return {
         "resultado": structured_summaries,
_utils/splitters/Splitter_class.py
CHANGED
@@ -19,7 +19,7 @@ class Splitter:
         self.chunk_metadata = {}  # Store chunk metadata for tracing
 
     def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None
+        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
     ) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
@@ -43,10 +43,14 @@ class Splitter:
            )  # Retorna a posição onde se encontra o chunk dentro da página inteira
            end_char = start_char + len(chunk)
 
+            if should_use_llama_parse:
+                somar_pages = 0
+            else:
+                somar_pages = 1
             doc_chunk = DocumentChunk(  # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
                 content=chunk,
                 page_number=cast(int, page.metadata.get("page"))
-                + 1,  # 1-based page numbering
+                + somar_pages,  # 1-based page numbering
                 chunk_id=chunk_id,
                 start_char=char_count + start_char,
                 end_char=char_count + end_char,
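The somar_pages flag presumably exists because PyPDF-style loaders store 0-based page indices in page.metadata["page"], while the LlamaParse path already yields 1-based numbers; the commit doesn't state this, it is inferred from the offsets. A tiny standalone illustration:

```python
# Illustration (not part of the commit): normalize page numbers to 1-based
# regardless of which loader produced the page metadata.
def resolve_page_number(raw_page: int, should_use_llama_parse: bool) -> int:
    somar_pages = 0 if should_use_llama_parse else 1
    return raw_page + somar_pages

print(resolve_page_number(0, should_use_llama_parse=False))  # PyPDF page 0 -> 1
print(resolve_page_number(1, should_use_llama_parse=True))   # already 1-based -> 1
```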
gerar_documento/serializer.py
CHANGED
@@ -73,4 +73,4 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     gpt_temperature = serializers.FloatField(default=0)
     id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
     should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
-    should_use_llama_parse = serializers.BooleanField(required=False, default=
+    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
setup/easy_imports.py
CHANGED
@@ -14,6 +14,7 @@ from langchain.prompts import PromptTemplate
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import Chroma
+from langchain_google_genai import ChatGoogleGenerativeAI
 
 # from langchain_community.chat_models import ChatOpenAI
 from langchain_openai import ChatOpenAI