File size: 4,259 Bytes
741f393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b73bad
 
 
 
 
 
 
 
 
 
741f393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b73bad
741f393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import PyPDF2
from os import listdir
from os.path import isfile, join,isdir

import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
import sys
from langchain_text_splitters import TokenTextSplitter
from pptx import Presentation
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import docx
import os

# Point every Hugging Face cache at a writable directory so model downloads
# succeed even when the default home directory is read-only (e.g. sandboxes).
hf_cache_dir = "/tmp/huggingface_cache"
transformers_cache_dir = os.path.join(hf_cache_dir, "transformers")
os.environ.update(HF_HOME=hf_cache_dir, TRANSFORMERS_CACHE=transformers_cache_dir)

# Create the cache directories up front; exist_ok makes reruns harmless.
for _cache_path in (hf_cache_dir, transformers_cache_dir):
    os.makedirs(_cache_path, exist_ok=True)

def get_files(dir):
    """Recursively collect the paths of all files under *dir*.

    Args:
        dir: Root directory to walk (relative or absolute).

    Returns:
        list[str]: One path per file found, joined with its containing
        directory, in ``os.walk`` traversal order. Empty list for an
        empty or nonexistent tree (``os.walk`` yields nothing then).
    """
    # The original rebound the parameter ``dir`` as the walk variable,
    # which also shadows the builtin; use a distinct name for the root
    # of each visited directory instead.
    file_list = []
    for root, _subdirs, filenames in os.walk(dir):
        file_list.extend(os.path.join(root, name) for name in filenames)
    return file_list

def getTextFromWord(filename):
    """Return the plain text of a .docx file, one paragraph per line."""
    document = docx.Document(filename)
    # Join paragraph texts lazily; empty paragraphs become empty lines,
    # matching the original behavior.
    return '\n'.join(para.text for para in document.paragraphs)

def getTextFromPPTX(filename):
    """Return the text of every shape in a .pptx file, newline-separated."""
    presentation = Presentation(filename)
    collected = []
    # Flatten slide -> shape nesting; shapes without text contribute
    # whatever their .text attribute holds, exactly as before.
    for slide in presentation.slides:
        collected.extend(shape.text for shape in slide.shapes)
    return '\n'.join(collected)

def main_indexing(mypath):
    """Index every supported document under *mypath* into a local Qdrant store.

    Walks the directory tree, extracts text from PDF, TXT/Markdown, DOCX and
    PPTX files, splits it into overlapping token chunks, embeds the chunks
    with a MiniLM sentence-transformer, and stores them (with the source
    file path as metadata) in a freshly rebuilt "MyCollection" collection.

    Args:
        mypath: Root folder containing the documents to index.

    Side effects:
        Deletes and recreates the Qdrant collection under ./qdrant/, prints
        progress to stdout, and downloads the embedding model on first use.
    """
    model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
    # BUG FIX: the original set 'cpu' in the CUDA branch, making the CUDA
    # check dead code. Prefer CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        model_kwargs = {'device': 'cuda'}
    elif torch.backends.mps.is_available():
        model_kwargs = {'device': 'mps'}
    else:
        model_kwargs = {'device': 'cpu'}
    # Normalized embeddings make DOT distance equivalent to cosine similarity.
    encode_kwargs = {'normalize_embeddings': True}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        cache_folder=hf_cache_dir,
    )
    client = QdrantClient(path="qdrant/")
    collection_name = "MyCollection"
    # Rebuild the collection from scratch on every run.
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
    # size=384 matches the MiniLM embedding dimension.
    client.create_collection(collection_name, vectors_config=VectorParams(size=384, distance=Distance.DOT))
    qdrant = Qdrant(client, collection_name, hf)
    print("Indexing...")
    onlyfiles = get_files(mypath)
    # The splitter is stateless across files: create it once, not per file.
    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)
    for file in onlyfiles:
        file_content = ""
        if file.find("~") > 0:  # skip temp/backup files containing "~"
            file_content = "Empty due to ~ in file name."
            print("Document title with ~: " + file)
        elif file.endswith(".pdf"):
            try:
                print("indexing "+file)
                reader = PyPDF2.PdfReader(file)
                for page in reader.pages:
                    file_content = file_content + " " + page.extract_text()
            except Exception:
                # Encrypted/malformed PDFs: index a placeholder rather than abort.
                file_content = "Empty due to extraction error."
        elif file.endswith((".txt", ".md", ".markdown")):
            print("indexing " + file)
            # Context manager closes the file even if read() raises.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        elif file.endswith(".docx"):
            print("indexing " + file)
            file_content = getTextFromWord(file)
        elif file.endswith(".pptx"):
            print("indexing " + file)
            file_content = getTextFromPPTX(file)
        else:
            continue
        texts = text_splitter.split_text(file_content)
        # Every chunk carries its source path so hits can be traced back.
        metadata = [{"path": file} for _ in texts]
        qdrant.add_texts(texts, metadatas=metadata)
    print(onlyfiles)
    print("Finished indexing!")

if __name__ == "__main__":
    # Expect exactly one CLI argument: the folder of documents to index.
    cli_args = sys.argv[1:]
    if cli_args:
        main_indexing(cli_args[0])
    else:
        print("You need to provide a path to folder with documents to index as command line argument")