Spaces:
Running
Running
luanpoppe
committed on
Commit
·
39fc36b
1
Parent(s):
78209bc
feat: começando a adicionar testes com pytest
Browse files
- _utils/handle_files.py +5 -6
- requirements.txt +0 -0
- tests/bubble_integrations/test_obter_arquivo.py +28 -0
- tests/conftest.py +9 -0
- tests/fixtures/_pdf-uma-pagina.pdf +0 -0
- tests/test_handle_files.py +28 -0
- tests/test_splitters.py +82 -0
_utils/handle_files.py
CHANGED
|
@@ -4,6 +4,11 @@ from langchain_core.documents import Document as LangchainDocument
|
|
| 4 |
from llama_index import Document
|
| 5 |
from llama_parse import LlamaParse, ResultType
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def handle_pdf_files_from_serializer(files):
|
| 9 |
listaPDFs = []
|
|
@@ -28,11 +33,6 @@ def remove_pdf_temp_files(listaPDFs):
|
|
| 28 |
|
| 29 |
|
| 30 |
async def return_document_list_with_llama_parser(file: str):
|
| 31 |
-
llama_parser_keys = [
|
| 32 |
-
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
| 33 |
-
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
| 34 |
-
]
|
| 35 |
-
|
| 36 |
for key in llama_parser_keys:
|
| 37 |
documents: List[LangchainDocument] = []
|
| 38 |
if key:
|
|
@@ -48,7 +48,6 @@ async def return_document_list_with_llama_parser(file: str):
|
|
| 48 |
except:
|
| 49 |
print(f"Error with llama parser key ending with {key[-4:]}")
|
| 50 |
continue # Faz com que comece o próximo loop
|
| 51 |
-
print("parsed_document: ", parsed_document)
|
| 52 |
if len(parsed_document) == 0:
|
| 53 |
continue
|
| 54 |
|
|
|
|
| 4 |
from llama_index import Document
|
| 5 |
from llama_parse import LlamaParse, ResultType
|
| 6 |
|
| 7 |
+
llama_parser_keys = [
|
| 8 |
+
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
| 9 |
+
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
|
| 13 |
def handle_pdf_files_from_serializer(files):
|
| 14 |
listaPDFs = []
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
async def return_document_list_with_llama_parser(file: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
for key in llama_parser_keys:
|
| 37 |
documents: List[LangchainDocument] = []
|
| 38 |
if key:
|
|
|
|
| 48 |
except:
|
| 49 |
print(f"Error with llama parser key ending with {key[-4:]}")
|
| 50 |
continue # Faz com que comece o próximo loop
|
|
|
|
| 51 |
if len(parsed_document) == 0:
|
| 52 |
continue
|
| 53 |
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
tests/bubble_integrations/test_obter_arquivo.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
|
| 5 |
+
pdf_file_url = "https://vella.app.br/version-test/fileupload/f1736298232170x993758712541722200/0002269-86.2009.8.05.0032%20processo%20teste.pdf"
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestObterArquivo:
|
| 9 |
+
@pytest.mark.asyncio
|
| 10 |
+
async def test_get_pdf_from_bubble_No_llama_parse(self):
|
| 11 |
+
should_use_llama_parse = False
|
| 12 |
+
result = await get_pdf_from_bubble(pdf_file_url, should_use_llama_parse)
|
| 13 |
+
|
| 14 |
+
assert isinstance(result, list)
|
| 15 |
+
assert len(result) > 0
|
| 16 |
+
print("\n\nresult", result)
|
| 17 |
+
assert all(isinstance(item, Document) for item in result)
|
| 18 |
+
|
| 19 |
+
# Teste abaixo não funciona com arquivos grandes -> O Llama Parse dá erro de timeout
|
| 20 |
+
# @pytest.mark.asyncio
|
| 21 |
+
# async def test_get_pdf_from_bubble_With_llama_parse(self):
|
| 22 |
+
# should_use_llama_parse = True
|
| 23 |
+
# result = await get_pdf_from_bubble(pdf_file_url, should_use_llama_parse)
|
| 24 |
+
|
| 25 |
+
# assert isinstance(result, list)
|
| 26 |
+
# assert len(result) > 0
|
| 27 |
+
# print("\n\nresult", result)
|
| 28 |
+
# assert all(isinstance(item, Document) for item in result)
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import django
|
| 4 |
+
|
| 5 |
+
# Configura o Django
|
| 6 |
+
# Adiciona o diretório raiz do projeto ao sys.path
|
| 7 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 8 |
+
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "setup.settings")
|
| 9 |
+
django.setup()
|
tests/fixtures/_pdf-uma-pagina.pdf
ADDED
|
Binary file (26.6 kB). View file
|
|
|
tests/test_handle_files.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import os
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
|
| 5 |
+
from _utils.handle_files import return_document_list_with_llama_parser
|
| 6 |
+
|
| 7 |
+
cwd = os.getcwd()
|
| 8 |
+
pdf_file_url = os.path.join(cwd, "tests", "fixtures", "_pdf-uma-pagina.pdf")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestHandleFiles:
|
| 12 |
+
@pytest.mark.asyncio
|
| 13 |
+
async def test_return_document_list_with_llama_parser_With_wrong_keys(
|
| 14 |
+
self, monkeypatch
|
| 15 |
+
):
|
| 16 |
+
|
| 17 |
+
monkeypatch.setattr(
|
| 18 |
+
"_utils.handle_files.llama_parser_keys",
|
| 19 |
+
["abc", os.getenv("LLAMA_CLOUD_API_KEY_PEIXE")],
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
result = await return_document_list_with_llama_parser(pdf_file_url)
|
| 23 |
+
|
| 24 |
+
assert isinstance(result, list)
|
| 25 |
+
assert len(result) > 0
|
| 26 |
+
assert all(isinstance(item, Document) for item in result)
|
| 27 |
+
assert all(len(item.page_content) > 0 for item in result)
|
| 28 |
+
assert all(int(item.metadata.get("page", 0)) > 0 for item in result)
|
tests/test_splitters.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import os
|
| 3 |
+
from _utils.splitters.Splitter_class import Splitter
|
| 4 |
+
from _utils.models.gerar_relatorio import (
|
| 5 |
+
DocumentChunk,
|
| 6 |
+
)
|
| 7 |
+
|
| 8 |
+
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 9 |
+
chunk_size = 1000
|
| 10 |
+
chunk_overlap = 200
|
| 11 |
+
cwd = os.getcwd()
|
| 12 |
+
pdf_file = os.path.join(cwd, "tests", "fixtures", "_pdf-uma-pagina.pdf")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TestSplitters:
|
| 16 |
+
splitter = Splitter(chunk_size, chunk_overlap)
|
| 17 |
+
|
| 18 |
+
@pytest.mark.asyncio
|
| 19 |
+
async def test_load_and_split_document_No_llama_parse_No_Bubble(self, monkeypatch):
|
| 20 |
+
should_use_llama_parse = False
|
| 21 |
+
isBubble = False
|
| 22 |
+
|
| 23 |
+
result_chunks, result_strings = await self.splitter.load_and_split_document(
|
| 24 |
+
pdf_file, should_use_llama_parse, isBubble
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
assert isinstance(result_chunks, list)
|
| 28 |
+
assert isinstance(result_strings, list)
|
| 29 |
+
assert len(result_chunks) > 0
|
| 30 |
+
assert len(result_strings) > 0
|
| 31 |
+
assert all(isinstance(item, str) for item in result_strings)
|
| 32 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
| 33 |
+
assert all(
|
| 34 |
+
(chunk_size - 100) < len(item.content) < (chunk_size + 100)
|
| 35 |
+
for item in result_chunks
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
@pytest.mark.asyncio
|
| 39 |
+
async def test_load_and_split_document_No_llama_parse_No_Bubble_with_bigger_chunk(
|
| 40 |
+
self, monkeypatch
|
| 41 |
+
):
|
| 42 |
+
should_use_llama_parse = False
|
| 43 |
+
isBubble = False
|
| 44 |
+
chunk_size = 3500
|
| 45 |
+
splitter_temp = Splitter(chunk_size, chunk_overlap)
|
| 46 |
+
|
| 47 |
+
result_chunks, result_strings = await splitter_temp.load_and_split_document(
|
| 48 |
+
pdf_file, should_use_llama_parse, isBubble
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
assert isinstance(result_chunks, list)
|
| 52 |
+
assert isinstance(result_strings, list)
|
| 53 |
+
assert len(result_chunks) > 0
|
| 54 |
+
assert len(result_strings) > 0
|
| 55 |
+
assert all(isinstance(item, str) for item in result_strings)
|
| 56 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
| 57 |
+
assert all(
|
| 58 |
+
(chunk_size - 200) < len(item.content) < (chunk_size + 200)
|
| 59 |
+
for item in result_chunks
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
@pytest.mark.asyncio
|
| 63 |
+
async def test_load_and_split_document_With_llama_parse_No_Bubble(
|
| 64 |
+
self, monkeypatch
|
| 65 |
+
):
|
| 66 |
+
should_use_llama_parse = True
|
| 67 |
+
isBubble = False
|
| 68 |
+
result_chunks, result_strings = await self.splitter.load_and_split_document(
|
| 69 |
+
pdf_file, should_use_llama_parse, isBubble
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
assert isinstance(result_chunks, list)
|
| 73 |
+
assert isinstance(result_strings, list)
|
| 74 |
+
assert len(result_chunks) > 0
|
| 75 |
+
assert len(result_strings) > 0
|
| 76 |
+
assert all(isinstance(item, str) for item in result_strings)
|
| 77 |
+
assert all(isinstance(item, DocumentChunk) for item in result_chunks)
|
| 78 |
+
# Teste abaixo não passa ainda --> Será consertado no futuro
|
| 79 |
+
# assert all(
|
| 80 |
+
# (chunk_size - 100) < len(item.content) < (chunk_size + 100)
|
| 81 |
+
# for item in result_chunks
|
| 82 |
+
# )
|