luanpoppe committed
Commit e70ffc1 · 1 Parent(s): 8f3dc39

feat: add the option to make requests with Gemini, add the option to make requests with DeepSeek, make not using LlamaParse the default, and change the default so the contextual step makes fewer requests by handling responses that contain several chunks at once
Browse files
- .env.example +4 -1
- _utils/LLMs/LLM_class.py +26 -0
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +190 -84
- _utils/gerar_relatorio_modelo_usuario/prompts.py +65 -28
- _utils/handle_files.py +7 -3
- _utils/resumo_completo_cursor.py +1 -1
- _utils/splitters/Splitter_class.py +6 -2
- gerar_documento/serializer.py +1 -1
- setup/easy_imports.py +1 -0
.env.example
CHANGED
@@ -6,4 +6,7 @@ LANGCHAIN_API_KEY=""
 CLAUDE_API_KEY=""
 COHERE_API_KEY=""
 BUBBLE_TOKEN=""
-
+LLAMA_CLOUD_API_KEY_POPS=""
+LLAMA_CLOUD_API_KEY_PEIXE=""
+DEEPSEEKK_API_KEY=""
+GOOGLE_API_KEY_PEIXE=""
_utils/LLMs/LLM_class.py
CHANGED
@@ -1,4 +1,13 @@
+# from langchain_openai import OpenAI
+from typing import cast
+from openai import OpenAI
+from pydantic import SecretStr
 from setup.environment import default_model
+from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
+import os
+
+deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
+google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
 
 
 class LLM:
@@ -7,3 +16,20 @@ class LLM:
 
     # def create_GPT_model(self, model=default_model):
     #     return ChatOpen()
+
+    def deepseek(self):
+        return ChatOpenAI(
+            api_key=SecretStr(deepseek_api_key),
+            base_url="https://api.deepseek.com/v1",
+            model="deepseek-chat",
+        )
+
+    def googleGemini(self):
+        return ChatGoogleGenerativeAI(
+            api_key=SecretStr(google_api_key),
+            model="gemini-1.5-flash",
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        )
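The new class is a thin factory over LangChain chat models: deepseek() goes through DeepSeek's OpenAI-compatible endpoint, googleGemini() through langchain-google-genai. A minimal usage sketch, not part of the commit, assuming the keys from .env.example are exported and langchain-google-genai is installed:

```python
# Usage sketch of the new LLM factory (assumes DEEPSEEKK_API_KEY and
# GOOGLE_API_KEY_PEIXE are set in the environment).
import asyncio
from langchain_core.messages import HumanMessage
from _utils.LLMs.LLM_class import LLM

async def main() -> None:
    llms = LLM()
    # DeepSeek via the OpenAI-compatible base_url
    deepseek_reply = await llms.deepseek().ainvoke(
        [HumanMessage(content="Responda em uma linha: o que é um chunk?")]
    )
    # Gemini 1.5 Flash via langchain-google-genai
    gemini_reply = await llms.googleGemini().ainvoke(
        [HumanMessage(content="Responda em uma linha: o que é um chunk?")]
    )
    print(deepseek_reply.content)
    print(gemini_reply.content)

asyncio.run(main())
```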
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 
+from _utils.LLMs.LLM_class import LLM
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
     prompt_auxiliar_do_contextual_prompt,
     create_prompt_auxiliar_do_contextual_prompt,
@@ -11,7 +12,7 @@ from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
 from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -48,101 +49,211 @@ class ContextualRetriever:
         self.bm25 = None
         self.claude_context_model = claude_context_model
 
-    async def llm_generate_context(
-        self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    ) -> str:
-        """Generate contextual description using ChatOpenAI"""
-        try:
-            print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-            # response = await aclaude_answer(
-            #     self.claude_client, self.claude_context_model, prompt
-            # )
-
-            response = await agpt_answer(prompt)
-            return response
-        except Exception as e:
-            self.logger.error(
-                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-            )
-            return ""
-
-    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
-    #     prompt = Prompt().create_prompt_template(
-    #         "", prompt_auxiliar_do_contextual_prompt
-    #     )
-    #     Chain(prompt, ChatOpenAI())
-    #     return
-
-    async def create_contextualized_chunk(
-        self, chunk, single_page_text, response_auxiliar_summary
-    ):
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # Código comentado abaixo é para ler as páginas ao redor da página atual do chunk
-        # page_content = ""
-        # for i in range(
-        #     max(0, chunk.page_number - 1),
-        #     min(len(single_page_text), chunk.page_number + 2),
-        # ):
-        #     page_content += single_page_text[i].page_content if single_page_text[i] else ""
-        page_number = chunk.page_number - 1
-        page_content = single_page_text[page_number].page_content
-
-        context = await self.llm_generate_context(
-            page_content, chunk, response_auxiliar_summary
-        )
-        return ContextualizedChunk(
-            content=chunk.content,
-            page_number=chunk.page_number,
-            chunk_id=chunk.chunk_id,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
-            context=context,
-        )
-
     async def contextualize_all_chunks(
         self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
-        lista_contador = []
         full_text = ""
         for x in full_text_as_array:
             full_text += x.page_content
 
-        print("\n\n\
-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary)
+        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+        # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
+        # response_auxiliar_summary = await aclaude_answer(
+        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+        # )
+
+        llms = LLM()
+        response_auxiliar_summary = await llms.googleGemini().ainvoke(
+            [HumanMessage(content=prompt_auxiliar_summary)]
+        )
+
+        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
+
+        lista_de_listas_cada_com_20_chunks = [
+            chunks[i : i + 20] for i in range(0, len(chunks), 20)
+        ]
+        print(
+            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
+        )
 
         async with asyncio.TaskGroup() as tg:
             tasks = [
                 tg.create_task(
                     self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary
+                        chunk, full_text_as_array, response_auxiliar_summary.content
                     )
                 )
-                for chunk in chunks
+                # for chunk in chunks # ORIGINAL
+                for chunk in lista_de_listas_cada_com_20_chunks
             ]
 
-        contextualized_chunks = [task.result() for task in tasks]
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            # print("\n\ntask", task)
+            # print("\n\ntask.result()", task.result())
+
+            contextualized_chunks = contextualized_chunks + task.result()
 
+        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks
 
+    # ORIGINAL
+    # async def create_contextualized_chunk(
+    #     self, chunk, single_page_text, response_auxiliar_summary
+    # ):
+    #     lista_contador.append(0)
+    #     print("contador: ", len(lista_contador))
+    #     page_number = chunk.page_number - 1
+    #     page_content = single_page_text[page_number].page_content
+
+    #     context = await self.llm_generate_context(
+    #         page_content, chunk, response_auxiliar_summary
+    #     )
+    #     print("context: ", context)
+    #     return ContextualizedChunk(
+    #         content=chunk.content,
+    #         page_number=chunk.page_number,
+    #         chunk_id=chunk.chunk_id,
+    #         start_char=chunk.start_char,
+    #         end_char=chunk.end_char,
+    #         context=context,
+    #     )
+
+    async def create_contextualized_chunk(
+        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
+    ):
+
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
+        all_pages_contents = ""
+        contador = 1
+        for chunk in chunks:
+            page_number = chunk.page_number - 1
+            page_content = single_page_text[page_number].page_content
+
+            all_pages_contents += page_content
+            contador += 1
+
+        context = await self.llm_generate_context(
+            page_content, chunks, response_auxiliar_summary
+        )
+
+        context = (
+            context.replace("document_id: ", "")
+            .replace("document_id:", "")
+            .replace("DOCUMENT_ID: ", "")
+            .replace("DOCUMENT_ID: ", "")
+        )
+
+        # print("context: ", context)
+        import re
+
+        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Funciona para quando a resposta do LLM não vem com "document_id" escrito
+        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
+        matches = re.findall(pattern, context, re.DOTALL)
+
+        # Convert matches to the desired format
+        result = [
+            [int(doc_id), title.strip(), content.strip()]
+            for doc_id, title, content in matches
+        ]
+        # print("\n\nresult", result)
+
+        lista_chunks = []
+        for index, chunk in enumerate(chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    chunk_id=result[index][0],
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=" ".join(result[index][1:2]),
+                )
+            )
+
+        return lista_chunks
+
+    # ORIGINAL
+    # async def llm_generate_context(
+    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
+    # ) -> str:
+    #     """Generate contextual description using ChatOpenAI"""
+    #     try:
+    #         print("COMEÇOU A REQUISIÇÃO")
+    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
+    #         # response = await aclaude_answer(
+    #         #     self.claude_client, self.claude_context_model, prompt
+    #         # )
+
+    #         # response = await agpt_answer(prompt)
+    #         llms = LLM()
+    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+    #         return cast(str, response.content)
+    #     except Exception as e:
+    #         self.logger.error(
+    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+    #         )
+    #         return ""
+
+    async def llm_generate_context(
+        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+    ) -> str:
+        """Generate contextual description using ChatOpenAI"""
+        contador = 1
+        all_chunks_contents = ""
+
+        for chunk in chunks:
+            all_chunks_contents += chunk.content
+            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
+            contador += 1
+
+        try:
+            print("COMEÇOU A REQUISIÇÃO")
+            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            # response = await aclaude_answer(
+            #     self.claude_client, self.claude_context_model, prompt
+            # )
+
+            response = await agpt_answer(prompt)
+            # llms = LLM()
+            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+            # return cast(str, response.content)
+            return cast(str, response)
+        except Exception as e:
+            self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
+            return ""
+
+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
+
+# Primeira função chamada do arquivo
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
+
 
 async def get_full_text_and_all_PDFs_chunks(
     listaPDFs: List[str],
@@ -159,7 +270,9 @@ async def get_full_text_and_all_PDFs_chunks(
             pages = pages + await return_document_list_with_llama_parser(pdf_path)
         else:
            pages = pages + get_pdf_from_bubble(pdf_path)
-        chunks = splitterObject.load_and_split_document(pdf_path, pages)
+        chunks = splitterObject.load_and_split_document(
+            pdf_path, pages, should_use_llama_parse
+        )
         all_PDFs_chunks = all_PDFs_chunks + chunks
         # Get full text for contextualization
         # loader = PyPDFLoader(pdf_path)
@@ -170,17 +283,10 @@ async def get_full_text_and_all_PDFs_chunks(
     return all_PDFs_chunks, pages  # , full_text
 
 
-async def contextualize_chunk_based_on_serializer(
-    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
-):
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-            pages, all_PDFs_chunks
-        )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
+# Código comentado abaixo é para ler as páginas ao redor da página atual do chunk
+# page_content = ""
+# for i in range(
+#     max(0, chunk.page_number - 1),
+#     min(len(single_page_text), chunk.page_number + 2),
+# ):
+#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
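The core behavioral change is the fan-out: instead of one LLM request per chunk, create_contextualized_chunk now receives a batch of 20 chunks and returns a list of contextualized chunks, cutting the number of contextual requests by roughly a factor of 20. A standalone sketch of the pattern with simplified names and a stubbed LLM call, not the commit's exact code:

```python
# Sketch of the batch-of-20 fan-out: group chunks, one request per group
# inside an asyncio.TaskGroup, then flatten the per-batch result lists.
import asyncio
from typing import List

async def contextualize_batch(batch: List[str]) -> List[str]:
    # Stand-in for create_contextualized_chunk: one LLM call covers the batch.
    await asyncio.sleep(0)  # placeholder for the real request
    return [f"context for: {chunk}" for chunk in batch]

async def contextualize_all(chunks: List[str]) -> List[str]:
    batches = [chunks[i : i + 20] for i in range(0, len(chunks), 20)]
    async with asyncio.TaskGroup() as tg:  # requires Python 3.11+
        tasks = [tg.create_task(contextualize_batch(b)) for b in batches]
    contextualized: List[str] = []
    for task in tasks:
        contextualized += task.result()  # flatten List[List[str]]
    return contextualized

print(asyncio.run(contextualize_all([f"chunk {n}" for n in range(45)])))
```

Two observations on the new parsing step: result[index] assumes the model returned exactly one chunk_context entry per chunk, so a short response raises IndexError; and " ".join(result[index][1:2]) keeps only the document title, whereas result[index][1:3] would also include the generated context. Whether the latter is intentional is not stated in the commit.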
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -117,40 +117,77 @@ Formate sua resposta da seguinte maneira:
 </resumo_final>"""
 
 
-def contextual_prompt(single_page_text, summary_text, chunk_content):
-    return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
-
-Here's the summary of the full text of the document:
-<summary_text>
-{summary_text}
-</summary_text>
-
-Here's the single page where the chunk is situated:
-
-<single_page>
-{single_page_text}
-</single_page>
-
-And here's the specific chunk to contextualize:
-<chunk>
-{chunk_content}
-</chunk>
-
-Follow these steps:
-1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
-2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
-3. Identify where the specific chunk fits within these themes.
-4. Create a concise context that situates the chunk within the document.
-
-With this informations, your response should be a single, concise paragraph that includes:
-- The document ID
-- The document name
-- A brief context for the chunk
-
-Example final output structure (do not copy the content, only the format):
-<chunk_context>
-[Single paragraph with document ID, name, and chunk context]
-</chunk_context>"""
+# ORIGINAL
+# def contextual_prompt(single_page_text, summary_text, chunk_content):
+#     return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
+
+# Here's the summary of the full text of the document:
+# <summary_text>
+# {summary_text}
+# </summary_text>
+
+# Here's the single page where the chunk is situated:
+
+# <single_page>
+# {single_page_text}
+# </single_page>
+
+# And here's the specific chunk to contextualize:
+# <chunk>
+# {chunk_content}
+# </chunk>
+
+# Follow these steps:
+# 1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+# 2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+# 3. Identify where the specific chunk fits within these themes.
+# 4. Create a concise context that situates the chunk within the document.
+
+# With this informations, your response should be a single, concise paragraph that includes:
+# - The document ID
+# - The document name
+# - A brief context for the chunk
+
+# Example final output structure (do not copy the content, only the format):
+# <chunk_context>
+# [Single paragraph with document ID, name, and chunk context]
+# </chunk_context>"""
+
+
+def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+    return f"""
+You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
+Here's the summary of the full text of the document:
+<summary_text>
+{summary_text}
+</summary_text>
+Here are the pages where the chunks are situated:
+<page>
+{all_pages_contents}
+</page>
+You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
+1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
+3. Identify where the specific chunk fits within these themes.
+4. Create a concise context that situates the chunk within the document.
+Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
+<final_output>
+[document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</final_output>
+Here are the 20 chunks to analyze:
+<user_input>
+{chunk_content}
+</user_input>
+Example output structure (do not copy the content, only the format):
+1. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+2.<chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+[Continue for all 20 chunks]
+Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+"""
 
 
 # return f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
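A standalone sketch, with made-up sample model output, of how the numbered chunk_context format this prompt requests is parsed back by the regex in contextual_retriever.py:

```python
# Sketch (not part of the commit): parsing the numbered <chunk_context> list
# with the same pattern the retriever uses. The sample output is invented.
import re

llm_output = """
1. <chunk_context>
[123] --- [Petição Inicial] --- [This chunk covers the opening claims.]
</chunk_context>
2.<chunk_context>
[456] --- [Contestação] --- [This chunk covers the defendant's reply.]
</chunk_context>
"""

# Pattern used when the model omits the "document_id" label.
pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"
matches = re.findall(pattern, llm_output, re.DOTALL)
result = [
    [int(doc_id), title.strip(), content.strip()]
    for doc_id, title, content in matches
]
print(result)  # [[123, '[Petição Inicial]', ...], [456, '[Contestação]', ...]]
```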
_utils/handle_files.py
CHANGED
@@ -19,7 +19,7 @@ def handle_pdf_files_from_serializer(files):
             temp_file.write(chunk)
         temp_file_path = temp_file.name  # Get the path of the temporary file
         listaPDFs.append(temp_file_path)
-    print("
+    print("\n\nlistaPDFs: ", listaPDFs)
     return listaPDFs
 
 
@@ -29,7 +29,7 @@ def remove_pdf_temp_files(listaPDFs):
 
 
 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("
+    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
     documents: List[LangchainDocument] = []
     if llama_parser_api:
         parser = LlamaParse(
@@ -39,7 +39,11 @@ async def return_document_list_with_llama_parser(file: str):
             verbose=True,
         )
 
-        parsed_document = await parser.aget_json(file)
+        try:
+            parsed_document = await parser.aget_json(file)
+        except:
+            raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
+        print("parsed_document: ", parsed_document)
         for doc in parsed_document[0].get("pages"):  # type: ignore
             # documents.append(doc.to_langchain_format())
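One caveat in the new error handling: the bare except: re-raises a ValueError but discards the original LlamaParse error and traceback. A sketch of an alternative that preserves it, using a stand-in parser object rather than the real LlamaParse class:

```python
# Sketch (not part of the commit): chaining with "from e" keeps the original
# cause visible instead of swallowing it behind a generic ValueError.
import asyncio

class FakeParser:  # stand-in for LlamaParse
    async def aget_json(self, file: str):
        raise RuntimeError("upstream parser error")

async def parse_with_context(file: str):
    parser = FakeParser()
    try:
        return await parser.aget_json(file)
    except Exception as e:
        # "from e" chains the original exception as the cause
        raise ValueError("ALGO DEU ERRADO NO PARSER DO LLAMA PARSE") from e

# asyncio.run(parse_with_context("doc.pdf"))  # ValueError with RuntimeError cause
```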
_utils/resumo_completo_cursor.py
CHANGED
@@ -105,7 +105,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"
 
-    print("\n\ntexto_completo: ", texto_completo)
+    print("\n\ntexto_completo[0: 1000]: ", texto_completo[0:1000])
 
     return {
         "resultado": structured_summaries,
_utils/splitters/Splitter_class.py
CHANGED
@@ -19,7 +19,7 @@ class Splitter:
         self.chunk_metadata = {}  # Store chunk metadata for tracing
 
     def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None
+        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
     ) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
@@ -43,10 +43,14 @@ class Splitter:
            )  # Retorna a posição onde se encontra o chunk dentro da página inteira
            end_char = start_char + len(chunk)
 
+            if should_use_llama_parse:
+                somar_pages = 0
+            else:
+                somar_pages = 1
             doc_chunk = DocumentChunk(  # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
                 content=chunk,
                 page_number=cast(int, page.metadata.get("page"))
-                + 1,  # 1-based page numbering
+                + somar_pages,  # 1-based page numbering
                 chunk_id=chunk_id,
                 start_char=char_count + start_char,
                 end_char=char_count + end_char,
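The somar_pages flag presumably exists because PyPDF-style loaders store 0-based page indices in page.metadata["page"], while the LlamaParse path already yields 1-based numbers; the commit doesn't state this, it is inferred from the offsets. A tiny standalone illustration:

```python
# Illustration (not part of the commit): normalize page numbers to 1-based
# regardless of which loader produced the page metadata.
def resolve_page_number(raw_page: int, should_use_llama_parse: bool) -> int:
    somar_pages = 0 if should_use_llama_parse else 1
    return raw_page + somar_pages

print(resolve_page_number(0, should_use_llama_parse=False))  # PyPDF page 0 -> 1
print(resolve_page_number(1, should_use_llama_parse=True))   # already 1-based -> 1
```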
gerar_documento/serializer.py
CHANGED
@@ -73,4 +73,4 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     gpt_temperature = serializers.FloatField(default=0)
     id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
     should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
-    should_use_llama_parse = serializers.BooleanField(required=False, default=
+    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
setup/easy_imports.py
CHANGED
@@ -14,6 +14,7 @@ from langchain.prompts import PromptTemplate
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import Chroma
+from langchain_google_genai import ChatGoogleGenerativeAI
 
 # from langchain_community.chat_models import ChatOpenAI
 from langchain_openai import ChatOpenAI