# Source listing metadata: file size 4,259 bytes; revisions 741f393, 6b73bad.
import PyPDF2
from os import listdir
from os.path import isfile, join,isdir
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
import sys
from langchain_text_splitters import TokenTextSplitter
from pptx import Presentation
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import docx
import os
# Hugging Face cache locations, kept under /tmp so they are writable.
# NOTE(review): these variables are exported after the imports above have run;
# confirm that no imported library reads them at import time.
hf_cache_dir = "/tmp/huggingface_cache"
transformers_cache_dir = os.path.join(hf_cache_dir, "transformers")

# Export the cache locations through the environment.
os.environ.update(
    {
        "HF_HOME": hf_cache_dir,
        "TRANSFORMERS_CACHE": transformers_cache_dir,
    }
)

# Create both directories up front; already-existing ones are left alone.
for _cache_path in (hf_cache_dir, transformers_cache_dir):
    os.makedirs(_cache_path, exist_ok=True)
def get_files(dir):
    """Return the paths of all files below ``dir``, searched recursively.

    The tree is traversed with ``os.walk``; each file name is joined to the
    directory it was found in. Directories themselves are not listed.
    """
    collected = []
    for folder, _subdirs, names in os.walk(dir):
        collected.extend(os.path.join(folder, name) for name in names)
    return collected
def getTextFromWord(filename):
    """Return the text of a Word document, one paragraph per line.

    ``filename`` is opened with ``docx.Document`` and the ``text`` of every
    entry in its ``paragraphs`` is joined with newline characters.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def getTextFromPPTX(filename):
    """Return the text of all text-bearing shapes in a PowerPoint file.

    Every slide of the presentation opened from ``filename`` is scanned and
    the text of each shape that has a text frame is collected; the pieces are
    joined with newline characters. Shapes without a text frame are skipped.
    """
    prs = Presentation(filename)
    fullText = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes with a text frame expose ``text``. Reading it on
            # other shape kinds (e.g. pictures) raises AttributeError, which
            # previously aborted extraction for the whole presentation.
            if not shape.has_text_frame:
                continue
            fullText.append(shape.text)
    return '\n'.join(fullText)
def main_indexing(mypath):
    """Index every supported document under ``mypath`` into a Qdrant collection.

    The collection "MyCollection" of the client opened with
    ``QdrantClient(path="qdrant/")`` is deleted if it exists and recreated.
    Each file returned by ``get_files(mypath)`` is then handled by extension:

    * ``.pdf``: text of all pages via PyPDF2,
    * ``.txt`` / ``.md`` / ``.markdown``: read as UTF-8, ignoring bad bytes,
    * ``.docx`` / ``.pptx``: via ``getTextFromWord`` / ``getTextFromPPTX``,
    * anything else is skipped.

    Files whose path contains "~" after the first character, and PDFs whose
    extraction fails, are indexed with a short placeholder text instead of
    their content. The text is split into token chunks (500 tokens, 50
    overlap) and stored with ``{"path": <file>}`` as metadata for each chunk.

    Args:
        mypath: Root folder that is searched recursively for documents.
    """
    model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
    # NOTE(review): the CUDA branch selects 'cpu', the same as the fallback.
    # This may be a deliberate way of forcing CPU inference; confirm before
    # changing it to 'cuda'.
    if torch.cuda.is_available():
        model_kwargs = {'device': 'cpu'}
    elif torch.backends.mps.is_available():
        model_kwargs = {'device': 'mps'}
    else:
        model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        cache_folder=hf_cache_dir,
    )

    # Always start from an empty collection so that re-running the script
    # does not accumulate duplicate chunks.
    client = QdrantClient(path="qdrant/")
    collection_name = "MyCollection"
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
    client.create_collection(
        collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.DOT),
    )
    qdrant = Qdrant(client, collection_name, hf)

    print("Indexing...")
    onlyfiles = get_files(mypath)
    # The splitter is identical for every file, so build it once.
    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)
    text_suffixes = (".txt", ".md", ".markdown")

    for file in onlyfiles:
        if file.find("~") > 0:
            # Paths containing "~" are not opened; a placeholder is indexed.
            file_content = "Empty due to ~ in file name."
            print("Document title with ~: " + file)
        elif file.endswith(".pdf"):
            print("indexing " + file)
            try:
                reader = PyPDF2.PdfReader(file)
                # Every page is prefixed with a single space, as before.
                file_content = "".join(
                    " " + page.extract_text() for page in reader.pages
                )
            except Exception as exc:
                # Best effort: e.g. encrypted PDFs cannot be read. Report the
                # reason instead of hiding it, then index a placeholder.
                print(f"Could not extract text from {file}: {exc}")
                file_content = "Empty due to extraction error."
        elif file.endswith(text_suffixes):
            print("indexing " + file)
            # The context manager closes the handle even if read() fails.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        elif file.endswith(".docx"):
            print("indexing " + file)
            file_content = getTextFromWord(file)
        elif file.endswith(".pptx"):
            print("indexing " + file)
            file_content = getTextFromPPTX(file)
        else:
            continue

        texts = text_splitter.split_text(file_content)
        if not texts:
            # Nothing to store for an empty document.
            continue
        metadata = [{"path": file} for _ in texts]
        qdrant.add_texts(texts, metadatas=metadata)

    print(onlyfiles)
    print("Finished indexing!")
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the folder
    # whose documents are to be indexed.
    argv = sys.argv
    if len(argv) <= 1:
        print("You need to provide a path to folder with documents to index as command line argument")
    else:
        main_indexing(argv[1])