fpadron committed on
Commit
741f393
·
1 Parent(s): 8aa9db9

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exclude sensitive files
2
+ environment_var.py
3
+ # storage.sqlite
4
+ cache/
5
+ # qdrant/*
6
+ # !qdrant/.gitkeep
7
+ # Exclude compiled Python files
8
+ __pycache__/
9
+
10
+ # Exclude virtual environment
11
+ venv/
12
+
13
+ # Exclude lock and meta files
14
+ *.lock
15
+ meta.json
Dockerfile ADDED
File without changes
README.md CHANGED
@@ -1,10 +1 @@
1
- ---
2
- title: Api Space
3
- emoji: 📈
4
- colorFrom: blue
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Retrieval-augmented-generation-RAG
 
 
 
 
 
 
 
 
 
TestFolder/2201.01647v4.pdf ADDED
Binary file (791 kB). View file
 
TestFolder/subfolder/SeraphimdroidEmail.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OWASP Foundation Case # 00007032: OWASP Seraphimdroid
2
+
3
+ Hello,
4
+
5
+ I would like to present the OWASP Seraphimdroid security application to you and invite you to review it on your blog. This application is an open-source effort to protect users from malware, theft and data leaks, and to protect their privacy. OWASP Seraphimdroid development was also financially supported during Google Summer of Code 2014 and the OWASP Code Summer Sprint 2015.
6
+
7
+ You may download app from Google play on the following link: https://play.google.com/store/apps/details?id=org.owasp.seraphimdroid
8
+
9
+ The main project website is: https://www.owasp.org/index.php/OWASP_SeraphimDroid_Project
10
+
11
+ To highlight some of the features, I will list only a few:
12
+ + Permission scanner. The permission scanner shows a list of all installed applications and the permissions they use. The app also describes potential malicious uses of certain permissions. Seraphimdroid uses machine learning to predict whether an application might be malicious (a virus, Trojan, worm, rootkit, etc.) and notifies the user.
13
+ + Application and service locker. With OWASP Seraphimdroid, the user may lock access to some or all applications and system services (WiFi, network, Bluetooth) with a password.
14
+ + Install lock. This feature can lock all installing and uninstalling action on your device. Great for parental control.
15
+ + Outgoing call and SMS blocker. This feature will allow user to perform normally outgoing calls and SMS, but it will block outgoing calls and inform about outgoing SMS performed by trojan applications.
16
+ + Geo-fencing. This feature allows user to set a location range where the device should be. If the device exits the range it may set up alarm or start sending messages to the defined number with its location.
17
+ + Remote location. If the user loses the phone, they can send it an SMS containing a predefined secret code, and the phone will reply with the location coordinates of the device.
18
+ + Remote lock and lock
19
+
20
+ Please let me know if you have any questions.
21
+
22
+ Hope to hear from you soon.
api.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.staticfiles import StaticFiles
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, AutoModelForQuestionAnswering
5
+ from langchain_community.llms import HuggingFacePipeline
6
+ from qdrant_client import QdrantClient
7
+ from langchain_qdrant import QdrantVectorStore
8
+ import os
9
+ from pydantic import BaseModel
10
+ from langchain.chains import RetrievalQA
11
+ from langchain.schema import Document
12
+ import time
13
+ import torch
14
+
15
# Module-level handles shared between the startup hook and the endpoints.
# Every name starts out as None; the startup handler assigns the ones it loads.
model = tokenizer = dolly_pipeline_hf = embed_model = None
qdrant = model_name_hf = text_generation_pipeline = qa_pipeline = None
23
+
24
class Item(BaseModel):
    """Request body holding a single free-text ``query`` string."""

    query: str
26
+
27
app = FastAPI()

# Expose the files under ./TestFolder as static content at /TestFolder.
app.mount("/TestFolder", StaticFiles(directory="./TestFolder"), name="TestFolder")

# Create the working directories if they are missing.
for _workdir in ("./cache", "./offload", "./models"):
    os.makedirs(_workdir, exist_ok=True)
33
+
34
@app.on_event("startup")
async def startup_event():
    """Load the embedding model, the vector store and the QA pipeline.

    Populates the module-level ``embed_model``, ``qdrant``, ``model``,
    ``tokenizer`` and ``qa_pipeline`` handles used by the endpoints. A failure
    while opening Qdrant is reported but does not abort startup, in which case
    ``qdrant`` stays ``None``.
    """
    # Only the names actually assigned below need to be declared global.
    global model, tokenizer, embed_model, qdrant, qa_pipeline

    print("🚀 Loading model....")

    sentence_embedding_model_path = "sentence-transformers/paraphrase-MiniLM-L6-v2"
    start_time = time.perf_counter()

    embed_model = HuggingFaceEmbeddings(
        model_name=sentence_embedding_model_path,
        cache_folder="./models",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    try:
        qdrant_client = QdrantClient(path="qdrant/")
        qdrant = QdrantVectorStore(qdrant_client, "MyCollection", embed_model, distance="Dot")
    except Exception as e:
        # Best effort: keep the API running even when the store cannot be opened.
        print(f"❌ Error initializing Qdrant: {e}")

    model_path = "distilbert-base-cased-distilled-squad"
    model = AutoModelForQuestionAnswering.from_pretrained(model_path, cache_dir="./models")
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="./models")
    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        # Use the first GPU when one is available, otherwise the CPU.
        device=0 if torch.cuda.is_available() else -1,
    )

    end_time = time.perf_counter()
    # The loaded model is the DistilBERT QA model above, not Dolly.
    print(f"✅ Models loaded successfully in {end_time - start_time:.2f} seconds.")
68
+
69
@app.on_event("shutdown")
async def shutdown_event():
    """Drop the references to the loaded models and the vector store.

    The handles are rebound to ``None`` rather than deleted, so the
    module-level names stay defined.
    """
    # Every name rebound below must be declared global; otherwise the
    # assignment would create a local and leave the module-level object alive.
    global model, tokenizer, dolly_pipeline_hf, embed_model
    global qdrant, model_name_hf, text_generation_pipeline, qa_pipeline
    print("🚪 Shutting down the API and releasing model memory.")
    model = tokenizer = dolly_pipeline_hf = embed_model = None
    qdrant = model_name_hf = text_generation_pipeline = qa_pipeline = None
74
+
75
+
76
@app.get("/")
def read_root():
    """Return a fixed welcome message for the API root."""
    greeting = {"message": "Welcome to FastAPI"}
    return greeting
79
+
80
@app.post("/search")
def search(Item: Item):
    """Return the ten chunks most similar to the submitted query.

    Args:
        Item: Request body; its ``query`` field is the search text.

    Returns:
        A list of dicts with ``id`` (position in the ranking, starting at 0),
        ``path`` (the ``path`` metadata entry of the chunk, or ``None``) and
        ``content`` (the chunk text).
    """
    print("Search endpoint")
    search_result = qdrant.similarity_search(query=Item.query, k=10)
    # enumerate() gives each hit its own id; the counter was previously never
    # advanced, so every result carried id 0.
    return [
        {"id": i, "path": res.metadata.get("path"), "content": res.page_content}
        for i, res in enumerate(search_result)
    ]
95
+
96
@app.post("/ask_localai")
async def ask_localai(item: Item):
    """Answer a question from the most similar indexed chunks.

    Retrieves the top three chunks for ``item.query`` from the vector store,
    joins their text into one context string and runs the module-level
    question-answering pipeline over it.

    Returns:
        ``{"question": ..., "answer": ...}`` on success, otherwise a dict with
        a single ``"error"`` key describing what went wrong.
    """
    query = item.query

    # Startup tolerates a failed Qdrant initialisation and leaves ``qdrant``
    # as None; report that instead of failing with AttributeError.
    if qdrant is None:
        return {"error": "Vector store is not available."}

    search_result = qdrant.similarity_search(query=query, k=3)
    if not search_result:
        return {"error": "No relevant results found for the query."}

    context = " ".join(res.page_content for res in search_result)
    if not context.strip():
        return {"error": "No relevant context found."}

    try:
        qa_result = qa_pipeline(question=query, context=context)
        answer = qa_result["answer"]
    except Exception as e:
        # Keep the generic client-facing message but log the actual cause.
        print(f"❌ Error generating an answer: {e}")
        return {"error": "Failed to generate an answer."}

    return {"question": query, "answer": answer}
124
+
125
@app.get("/items/{item_id}")
def read_item(item_id: int, q: str = None):
    """Echo the path parameter ``item_id`` and the optional query parameter ``q``."""
    echoed = {"item_id": item_id, "q": q}
    return echoed
128
+
129
@app.post("/items/")
def create_item(item: Item):
    """Echo the posted item together with a computed ``total_price``.

    ``Item`` only declares ``query``, so ``price`` and ``tax`` are read
    defensively and treated as 0 when absent; direct attribute access raised
    AttributeError on every request.
    """
    # NOTE(review): this endpoint looks like tutorial leftover — confirm it is needed.
    price = getattr(item, "price", 0)
    tax = getattr(item, "tax", None) or 0
    return {"item": item, "total_price": price + tax}
index.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from os import listdir
3
+ from os.path import isfile, join,isdir
4
+
5
+ import torch
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_qdrant import Qdrant
8
+ import sys
9
+ from langchain_text_splitters import TokenTextSplitter
10
+ from pptx import Presentation
11
+ from qdrant_client import QdrantClient
12
+ from qdrant_client.models import Distance, VectorParams
13
+ import docx
14
+ import os
15
+
16
def get_files(dir):
    """Return the paths of all files found under *dir*, recursively.

    Args:
        dir: Root directory to walk. The name shadows the builtin but is kept
            so existing callers are unaffected.

    Returns:
        A list of file paths, each joined onto the directory it was found in.
        A root that does not exist yields an empty list.
    """
    # Use distinct loop names so the walk does not rebind the ``dir`` argument.
    return [
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(dir)
        for filename in filenames
    ]
22
+
23
def getTextFromWord(filename):
    """Return the text of every paragraph in a .docx file, one per line."""
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
29
+
30
def getTextFromPPTX(filename):
    """Return the text of every text-bearing shape in a .pptx file, one per line.

    Shapes are visited slide by slide. Shapes without a text frame (pictures,
    charts, tables, ...) are skipped, because reading ``.text`` on them raises
    AttributeError.
    """
    prs = Presentation(filename)
    fullText = [
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if shape.has_text_frame
    ]
    return '\n'.join(fullText)
37
+
38
def main_indexing(mypath):
    """Rebuild the "MyCollection" Qdrant collection from the files under *mypath*.

    The collection is deleted if it exists and recreated. Every .pdf, .txt,
    .md, .markdown, .docx and .pptx file found by ``get_files`` is read, split
    into token chunks and stored with its path as metadata. Other file types
    are skipped.

    Args:
        mypath: Root folder containing the documents to index.
    """
    model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
    # Pick the best available device. The CUDA branch previously selected
    # 'cpu', which made the check pointless.
    if torch.cuda.is_available():
        device = 'cuda'
    elif torch.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        cache_folder="./models",
    )

    client = QdrantClient(path="qdrant/")
    collection_name = "MyCollection"
    # Start from an empty collection on every run.
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
    client.create_collection(
        collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.DOT),
    )
    qdrant = Qdrant(client, collection_name, hf)

    print("Indexing...")
    onlyfiles = get_files(mypath)
    # The splitter is stateless, so build it once instead of once per file.
    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)

    for file in onlyfiles:
        file_content = ""
        if file.find("~") > 0:  # added by pdchristian to catch files with "~" in file name
            file_content = "Empty due to ~ in file name."
            print("Document title with ~: " + file)
        elif file.endswith(".pdf"):
            try:
                print("indexing " + file)
                reader = PyPDF2.PdfReader(file)
                # Each page's text is prefixed with a single space.
                file_content = "".join(
                    " " + (page.extract_text() or "") for page in reader.pages
                )
            except Exception as exc:  # added by pdchristian to catch decryption error
                print(f"Could not extract text from {file}: {exc}")
                file_content = "Empty due to extraction error."
        elif file.endswith((".txt", ".md", ".markdown")):
            print("indexing " + file)
            # The context manager closes the file even if reading fails.
            with open(file, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
        elif file.endswith(".docx"):
            print("indexing " + file)
            file_content = getTextFromWord(file)
        elif file.endswith(".pptx"):
            print("indexing " + file)
            file_content = getTextFromPPTX(file)
        else:
            continue

        texts = text_splitter.split_text(file_content)
        if not texts:
            # Nothing to store for an empty document.
            continue
        # One metadata entry per chunk, all pointing at the source file.
        metadata = [{"path": file} for _ in texts]
        qdrant.add_texts(texts, metadatas=metadata)

    print(onlyfiles)
    print("Finished indexing!")
98
+
99
if __name__ == "__main__":
    # The folder to index is taken from the first command-line argument.
    if len(sys.argv) <= 1:
        print("You need to provide a path to folder with documents to index as command line argument")
    else:
        main_indexing(sys.argv[1])
models/.gitkeep ADDED
File without changes
offload/.gitkeep ADDED
File without changes
qdrant/.gitkeep ADDED
File without changes
qdrant/collection/MyCollection/storage.sqlite ADDED
Binary file (295 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # torch
2
+ transformers
3
+ langchain
4
+ langgraph
5
+ qdrant-client
6
+ PyPDF2
7
+ tiktoken
8
+ langchain-text-splitters
9
+ sentence_transformers
10
+ langchain-qdrant
11
+ fastapi
12
+ gradio_client
13
+ pydantic
14
+ uvicorn
15
+ accelerate
16
+ streamlit
17
+ python-docx
18
+ python-pptx
19
+ openai
20
+ requests
21
+ nltk
22
+ langchain-huggingface
23
+ langchain-community
test_local.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
# Manual smoke test: post one question to a locally running API.
url = "http://127.0.0.1:8000/ask_localai"

payload = json.dumps({
    "query": "Is there a case number for OWASP?"
})
headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}

response = requests.request("POST", url, headers=headers, data=payload)
print("-------------------")
print(response)
# The Response repr only shows the status code; print the body as well so the
# returned answer (or error) is visible.
print(response.text)
user_interface.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import streamlit as st
3
+ import requests
4
+ import json
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+
8
# English stop words are dropped from the search keywords before highlighting.
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words("english"))

# Document that the result links and the footer link point to.
pdf_url = "http://127.0.0.1:8000/TestFolder/2201.01647v4.pdf"

st.set_page_config(layout="wide")

# Page-level CSS, injected as raw HTML.
_PAGE_STYLE = """
<style>
.main {
display: flex;
justify-content: center;
padding-top: 30px;
}
.content-container {
max-width: 1200px; /* Adjust width for centering */
width: 100%;
}
</style>
"""
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)

st.markdown('<div class="content-container">', unsafe_allow_html=True)
st.title('_:blue[Local File Search]_ :sunglasses:')

# Seed the session state once so the values survive Streamlit reruns.
for _state_key, _initial in (
    ("search_result", []),
    ("ai_result", ""),
    ("search_input", ""),
    ("ai_input", ""),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial
42
+
43
def format_keywords_as_list(content, keywords, num_words=10):
    """Build highlighted context snippets for every keyword hit in *content*.

    Args:
        content: Text to search.
        keywords: Candidate keywords; entries found in ``STOPWORDS``
            (compared in lower case) are ignored.
        num_words: Number of context words kept on each side of a hit. The
            trailing side additionally includes the word where the hit starts.

    Returns:
        A list of HTML snippets, one per match, with every keyword occurrence
        wrapped in a highlighting ``<span>``. Returns
        ``["No relevant content found."]`` when no usable keyword remains and
        ``["No matches found."]`` when nothing in *content* matches.
    """
    filtered_keywords = [kw for kw in keywords if kw.lower() not in STOPWORDS]
    escaped_keywords = "|".join(map(re.escape, filtered_keywords))

    if not escaped_keywords:
        return ["No relevant content found."]

    # Compile once; the same pattern is used for matching and highlighting.
    pattern = re.compile(escaped_keywords, re.IGNORECASE)
    matches = list(pattern.finditer(content))
    if not matches:
        return ["No matches found."]

    def highlight(m):
        return f"<span style='background-color: yellow; font-weight: bold;'>{m.group(0)}</span>"

    snippets = []
    for match in matches:
        start_index = match.start()
        # Honour ``num_words`` for the leading context too (it was fixed at
        # 10). A slice of [-0:] would return everything, so guard against 0.
        if num_words > 0:
            words_before = content[:start_index].split()[-num_words:]
        else:
            words_before = []
        words_after = content[start_index:].split()[:num_words + 1]
        snippet = " ".join(words_before + words_after)
        snippets.append(f"... {pattern.sub(highlight, snippet)} ...")

    return snippets
70
+
71
def _post_query(endpoint, query):
    """POST ``{"query": query}`` as JSON to *endpoint* and return the decoded reply."""
    reply = requests.post(
        endpoint,
        headers={
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        },
        data=json.dumps({"query": query}),
    )
    reply.raise_for_status()
    return reply.json()


left_col, right_col = st.columns([1, 1])

# Left column: keyword search over the indexed files.
with left_col:
    st.subheader("Search Files")
    search_input = st.text_input(
        "Enter keywords to search your local files:",
        st.session_state.search_input,
        key="search_input_key",
    )
    if st.button("Search files"):
        st.session_state.search_input = search_input
        try:
            found = _post_query("http://127.0.0.1:8000/search", search_input)
        except requests.exceptions.RequestException as e:
            st.session_state.search_result = [{"content": f"HTTP Request failed: {e}", "path": ""}]
        except json.JSONDecodeError:
            st.session_state.search_result = [{"content": "Failed to decode JSON response.", "path": ""}]
        else:
            if isinstance(found, list):
                st.session_state.search_result = found
            else:
                st.session_state.search_result = [{"content": "Unexpected data format received.", "path": ""}]

    if st.session_state.search_result:
        st.write("### Results:")
        keywords = st.session_state.search_input.split()
        for item in st.session_state.search_result:
            snippets = format_keywords_as_list(item.get('content', ""), keywords)
            # Entries without any keyword hit are not displayed.
            valid_snippets = [snippet for snippet in snippets if snippet != "No matches found."]
            if not valid_snippets:
                continue
            st.markdown(
                f"<span style='font-size:20px; font-weight:bold;'>Document: <a href='{pdf_url}' target='_blank' style='text-decoration: none; color: blue;'>{item.get('path', 'Unknown File')}</a></span>",
                unsafe_allow_html=True,
            )
            for snippet in valid_snippets:
                st.markdown(f"- {snippet}", unsafe_allow_html=True)

# Right column: question answering through the /ask_localai endpoint.
with right_col:
    st.subheader("Ask LocalAI")
    ai_input = st.text_input(
        "Enter your question for LocalAI:",
        st.session_state.ai_input,
        key="ai_input_key",
    )
    if st.button("Ask LocalAI"):
        st.session_state.ai_input = ai_input
        try:
            reply_data = _post_query("http://127.0.0.1:8000/ask_localai", ai_input)
        except requests.exceptions.RequestException as e:
            st.session_state.ai_result = f"HTTP Request failed: {e}"
        except json.JSONDecodeError:
            st.session_state.ai_result = "Failed to decode JSON response.."
        else:
            if "answer" in reply_data:
                query = reply_data.get("question", "No question provided.")
                answer = reply_data.get("answer", "No answer provided.")
                st.session_state.ai_result = f"### Question:\n{query}\n\n### Answer:\n{answer}"
            else:
                st.session_state.ai_result = "No 'answer' field found in the response."

    if st.session_state.ai_result:
        st.write(st.session_state.ai_result)

    st.markdown(
        f"<span style='font-size:16px;'>This AI model is trained from the following document: <a href='{pdf_url}' target='_blank' style='color: blue;'>View PDF</a></span>",
        unsafe_allow_html=True,
    )

# Close the wrapper div opened earlier in the script.
st.markdown('</div>', unsafe_allow_html=True)
uvicorn_start.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ import uvicorn
3
+
4
+
5
if __name__ == "__main__":
    # Development entry point with auto-reload. ``workers`` is not passed
    # because it cannot be combined with ``reload=True``.
    uvicorn.run("api:app", host='127.0.0.1', port=8000, reload=True)