Spaces:

fpadron
/

api-space

Sleeping

App Files Files Community

fpadron committed on Dec 13, 2024

Commit

741f393

1 Parent(s): 8aa9db9

initial commit

Browse files

Files changed (15) hide show

.gitignore +15 -0
Dockerfile +0 -0
README.md +1 -10
TestFolder/2201.01647v4.pdf +0 -0
TestFolder/subfolder/SeraphimdroidEmail.txt +22 -0
api.py +131 -0
index.py +104 -0
models/.gitkeep +0 -0
offload/.gitkeep +0 -0
qdrant/.gitkeep +0 -0
qdrant/collection/MyCollection/storage.sqlite +0 -0
requirements.txt +23 -0
test_local.py +16 -0
user_interface.py +151 -0
uvicorn_start.py +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+# Exclude sensitive files
+environment_var.py
+# storage.sqlite
+cache/
+# qdrant/*
+# !qdrant/.gitkeep
+# Exclude compiled Python files
+__pycache__/
+# Exclude virtual environment
+venv/
+# Exclude lock and meta files
+*.lock
+meta.json

Dockerfile ADDED Viewed

File without changes

README.md CHANGED Viewed

@@ -1,10 +1 @@
----
-title: Api Space
-emoji: 📈
-colorFrom: blue
-colorTo: pink
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ # Retrieval-augmented-generation-RAG

TestFolder/2201.01647v4.pdf ADDED Viewed

Binary file (791 kB). View file

TestFolder/subfolder/SeraphimdroidEmail.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+OWASP Foundation Case # 00007032: OWASP Seraphimdroid
+Hello,
+I would like to present to you and invite you to review on your blog OWASP Seraphimdroid security application. This application is an Open Source effort to protect users from malware, theft, data leak and to protect their privacy. OWASP Seraphimdroid development was also financially supported during Google Summer of Code 2014 and OWASP Code Summer sprint 2015.
+You may download app from Google play on the following link: https://play.google.com/store/apps/details?id=org.owasp.seraphimdroid
+The main project website is: https://www.owasp.org/index.php/OWASP_SeraphimDroid_Project
+In order to highlight some of the features, I will list only a few:
++ Permission scanner. Permission scanner will show you the list of all installed application and the permission they are using. Also app will describe potential malicious use of certain permissions. Seraphimdroid is using machine learning in order to predict whether application might be malicious (be a virus, Trojan, worm, rootkit, etc) or not and will notify the user.
++ Application and service locker. With OWASP Seraphimdroid, user may lock access to certain or to all of your applications and system services (WiFi, network, BlueTooth) with password
++ Install lock. This feature can lock all installing and uninstalling action on your device. Great for parental control.
++ Outgoing call and SMS blocker. This feature will allow user to perform normally outgoing calls and SMS, but it will block outgoing calls and inform about outgoing SMS performed by trojan applications.
++ Geo-fencing. This feature allows user to set a location range where the device should be. If the device exits the range it may set up alarm or start sending messages to the defined number with its location.
++ Remote location. If the user loses their phone, they can send it an SMS containing a predefined secret code, and the phone will reply with the location coordinates of the device.
++ Remote lock and lock
+Please let me know if you have any questions.
+Hope to hear from you soon.

api.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+from langchain_huggingface import HuggingFaceEmbeddings
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, AutoModelForQuestionAnswering
+from langchain_community.llms import HuggingFacePipeline
+from qdrant_client import QdrantClient
+from langchain_qdrant import QdrantVectorStore
+import os
+from pydantic import BaseModel
+from langchain.chains import RetrievalQA
+from langchain.schema import Document
+import time
+import torch
+model = None
+tokenizer = None
+dolly_pipeline_hf = None
+embed_model = None
+qdrant = None
+model_name_hf = None
+text_generation_pipeline = None
+qa_pipeline = None
+class Item(BaseModel):
+    query: str
+app = FastAPI()
+app.mount("/TestFolder", StaticFiles(directory="./TestFolder"), name="TestFolder")
+os.makedirs("./cache", exist_ok=True)
+os.makedirs("./offload", exist_ok=True)
+os.makedirs("./models", exist_ok=True)
+@app.on_event("startup")
+async def startup_event():
+    global model, tokenizer, dolly_pipeline_hf, embed_model, qdrant, model_name_hf, text_generation_pipeline, qa_pipeline
+    print("🚀 Loading model....")
+    sentence_embedding_model_path = "sentence-transformers/paraphrase-MiniLM-L6-v2"
+    start_time = time.perf_counter()
+    embed_model = HuggingFaceEmbeddings(
+    model_name=sentence_embedding_model_path,
+    cache_folder="./models",
+    model_kwargs={"device": "cpu"},
+    encode_kwargs={"normalize_embeddings": True},
+    )
+    try:
+        qdrant_client = QdrantClient(path="qdrant/")
+        qdrant = QdrantVectorStore(qdrant_client, "MyCollection", embed_model, distance="Dot")
+    except Exception as e:
+        print(f"❌ Error initializing Qdrant: {e}")
+    model_path = "distilbert-base-cased-distilled-squad"
+    model = AutoModelForQuestionAnswering.from_pretrained(model_path, cache_dir="./models")
+    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="./models")
+    qa_pipeline = pipeline(
+        "question-answering",
+        model=model,
+        tokenizer=tokenizer,
+        device=0 if torch.cuda.is_available() else -1
+    )
+    end_time = time.perf_counter()
+    print(f"✅ Dolly model loaded successfully in {end_time - start_time:.2f} seconds.")
+app.on_event("shutdown")
+async def shutdown_event():
+    global model, tokenizer, dolly_pipeline_hf
+    print("🚪 Shutting down the API and releasing model memory.")
+    del model, tokenizer, dolly_pipeline_hf, embed_model, qdrant, model_name_hf, text_generation_pipeline, qa_pipeline
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to FastAPI"}
+@app.post("/search")
+def search(Item:Item):
+    print("Search endpoint")
+    query = Item.query
+    search_result = qdrant.similarity_search(
+        query=query, k=10
+    )
+    i = 0
+    list_res = []
+    for res in search_result:
+        list_res.append({"id":i,"path":res.metadata.get("path"),"content":res.page_content})
+    return list_res
+@app.post("/ask_localai")
+async def ask_localai(item: Item):
+    query = item.query
+    search_result = qdrant.similarity_search(query=query, k=3)
+    if not search_result:
+        return {"error": "No relevant results found for the query."}
+    context = " ".join([res.page_content for res in search_result])
+    if not context.strip():
+        return {"error": "No relevant context found."}
+    try:
+        prompt = (
+            f"Context: {context}\n\n"
+            f"Question: {query}\n"
+            f"Answer concisely and only based on the context provided. Do not repeat the context or the question.\n"
+            f"Answer:"
+        )
+        qa_result = qa_pipeline(question=query, context=context)
+        answer = qa_result["answer"]
+        return {
+            "question": query,
+            "answer": answer
+        }
+    except Exception as e:
+        return {"error": "Failed to generate an answer."}
+@app.get("/items/{item_id}")
+def read_item(item_id: int, q: str = None):
+    return {"item_id": item_id, "q": q}
+@app.post("/items/")
+def create_item(item: Item):
+    return {"item": item, "total_price": item.price + (item.tax or 0)}

index.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import PyPDF2
+from os import listdir
+from os.path import isfile, join,isdir
+import torch
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_qdrant import Qdrant
+import sys
+from langchain_text_splitters import TokenTextSplitter
+from pptx import Presentation
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams
+import docx
+import os
+def get_files(dir):
+    file_list = []
+    for dir, _, filenames in os.walk(dir):
+        for f in filenames:
+            file_list.append(os.path.join(dir, f))
+    return file_list
+def getTextFromWord(filename):
+    doc = docx.Document(filename)
+    fullText = []
+    for para in doc.paragraphs:
+        fullText.append(para.text)
+    return '\n'.join(fullText)
+def getTextFromPPTX(filename):
+    prs = Presentation(filename)
+    fullText = []
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            fullText.append(shape.text)
+    return '\n'.join(fullText)
+def main_indexing(mypath):
+    model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
+    if torch.cuda.is_available():
+        model_kwargs = {'device': 'cpu'}
+    elif torch.backends.mps.is_available():
+        model_kwargs = {'device': 'mps'}
+    else:
+        model_kwargs = {'device': 'cpu'}
+    encode_kwargs = {'normalize_embeddings': True}
+    hf = HuggingFaceEmbeddings(
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs,
+        cache_folder="./models"
+    )
+    client = QdrantClient(path="qdrant/")
+    collection_name = "MyCollection"
+    if client.collection_exists(collection_name):
+        client.delete_collection(collection_name)
+    client.create_collection(collection_name,vectors_config=VectorParams(size=384, distance=Distance.DOT))
+    qdrant = Qdrant(client, collection_name, hf)
+    print("Indexing...")
+    onlyfiles = get_files(mypath)
+    file_content = ""
+    for file in onlyfiles:
+        file_content = ""
+        if file.find("~") > 0:  # added by pdchristian to catch files with "~" in file name
+            file_content = "Empty due to ~ in file name."  # added by pdchristian to catch files with "~" in file name
+            print("Document title with ~: " + file)
+        elif file.endswith(".pdf"):
+            try:
+                print("indexing "+file)
+                reader = PyPDF2.PdfReader(file)
+                for i in range(0,len(reader.pages)):
+                    file_content = file_content + " "+reader.pages[i].extract_text()
+            except Exception as exc:# added by pdchristian to catch decryption error
+                file_content = "Empty due to extraction error."  # added by pdchristian to catch decryption error
+        elif file.endswith(".txt") or file.endswith(".md") or file.endswith(".markdown"):
+            print("indexing " + file)
+            f = open(file,'r',encoding='utf-8',errors='ignore')
+            file_content = f.read()
+            f.close()
+        elif file.endswith(".docx"):
+            print("indexing " + file)
+            file_content = getTextFromWord(file)
+        elif file.endswith(".pptx"):
+            print("indexing " + file)
+            file_content = getTextFromPPTX(file)
+        else:
+            continue
+        text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)
+        texts = text_splitter.split_text(file_content)
+        metadata = []
+        for i in range(0,len(texts)):
+            metadata.append({"path":file})
+        qdrant.add_texts(texts,metadatas=metadata)
+        len(texts)
+    print(onlyfiles)
+    print("Finished indexing!")
+if __name__ == "__main__":
+    arguments = sys.argv
+    if len(arguments)>1:
+        main_indexing(arguments[1])
+    else:
+        print("You need to provide a path to folder with documents to index as command line argument")

models/.gitkeep ADDED Viewed

File without changes

offload/.gitkeep ADDED Viewed

File without changes

qdrant/.gitkeep ADDED Viewed

File without changes

qdrant/collection/MyCollection/storage.sqlite ADDED Viewed

Binary file (295 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+# torch
+transformers
+langchain
+langgraph
+qdrant-client
+PyPDF2
+tiktoken
+langchain-text-splitters
+sentence_transformers
+langchain-qdrant
+fastapi
+gradio_client
+pydantic
+uvicorn
+accelerate
+streamlit
+python-docx
+python-pptx
+openai
+requests
+nltk
+langchain-huggingface
+langchain-community

test_local.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import requests
+import json
+url = "http://127.0.0.1:8000/ask_localai"
+payload = json.dumps({
+  "query": "Is there a case number for OWASP?"
+})
+headers = {
+  'Accept': 'application/json',
+  'Content-Type': 'application/json'
+}
+response = requests.request("POST", url, headers=headers, data=payload)
+print("-------------------")
+print(response)

user_interface.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import re
+import streamlit as st
+import requests
+import json
+import nltk
+from nltk.corpus import stopwords
+nltk.download('stopwords', quiet=True)
+STOPWORDS = set(stopwords.words("english"))
+pdf_url = "http://127.0.0.1:8000/TestFolder/2201.01647v4.pdf"
+st.set_page_config(layout="wide")
+st.markdown(
+    """
+    <style>
+    .main {
+        display: flex;
+        justify-content: center;
+        padding-top: 30px;
+    }
+    .content-container {
+        max-width: 1200px; /* Adjust width for centering */
+        width: 100%;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+st.markdown('<div class="content-container">', unsafe_allow_html=True)
+st.title('_:blue[Local File Search]_ :sunglasses:')
+if "search_result" not in st.session_state:
+    st.session_state.search_result = []
+if "ai_result" not in st.session_state:
+    st.session_state.ai_result = ""
+if "search_input" not in st.session_state:
+    st.session_state.search_input = ""
+if "ai_input" not in st.session_state:
+    st.session_state.ai_input = ""
+def format_keywords_as_list(content, keywords, num_words=10):
+    filtered_keywords = [kw for kw in keywords if kw.lower() not in STOPWORDS]
+    escaped_keywords = "|".join(map(re.escape, filtered_keywords))
+    if not escaped_keywords:
+        return ["No relevant content found."]
+    matches = list(re.finditer(escaped_keywords, content, re.IGNORECASE))
+    if not matches:
+        return ["No matches found."]
+    snippets = []
+    for match in matches:
+        start_index = match.start()
+        words_before = content[:start_index].split()[-10:]
+        words_after = content[start_index:].split()[:num_words + 1]
+        snippet = " ".join(words_before + words_after)
+        highlighted_snippet = re.sub(
+            escaped_keywords,
+            lambda m: f"<span style='background-color: yellow; font-weight: bold;'>{m.group(0)}</span>",
+            snippet,
+            flags=re.IGNORECASE,
+        )
+        snippets.append(f"... {highlighted_snippet} ...")
+    return snippets
+left_col, right_col = st.columns([1, 1])
+with left_col:
+    st.subheader("Search Files")
+    search_input = st.text_input("Enter keywords to search your local files:", st.session_state.search_input, key="search_input_key")
+    if st.button("Search files"):
+        st.session_state.search_input = search_input
+        url = "http://127.0.0.1:8000/search"
+        payload = json.dumps({"query": search_input})
+        headers = {
+            'Accept': 'application/json',
+            'Content-Type': 'application/json'
+        }
+        try:
+            response = requests.post(url, headers=headers, data=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            if isinstance(response_data, list):
+                st.session_state.search_result = response_data
+            else:
+                st.session_state.search_result = [{"content": "Unexpected data format received.", "path": ""}]
+        except requests.exceptions.RequestException as e:
+            st.session_state.search_result = [{"content": f"HTTP Request failed: {e}", "path": ""}]
+        except json.JSONDecodeError:
+            st.session_state.search_result = [{"content": "Failed to decode JSON response.", "path": ""}]
+    if st.session_state.search_result:
+        st.write("### Results:")
+        for item in st.session_state.search_result:
+            keywords = st.session_state.search_input.split()
+            snippets = format_keywords_as_list(item.get('content', ""), keywords)
+            valid_snippets = [snippet for snippet in snippets if snippet != "No matches found."]
+            if valid_snippets:
+                st.markdown(f"<span style='font-size:20px; font-weight:bold;'>Document: <a href='{pdf_url}' target='_blank' style='text-decoration: none; color: blue;'>{item.get('path', 'Unknown File')}</a></span>",
+                            unsafe_allow_html=True)
+                for snippet in valid_snippets:
+                    st.markdown(f"- {snippet}", unsafe_allow_html=True)
+with right_col:
+    st.subheader("Ask LocalAI")
+    ai_input = st.text_input("Enter your question for LocalAI:", st.session_state.ai_input, key="ai_input_key")
+    if st.button("Ask LocalAI"):
+        st.session_state.ai_input = ai_input
+        url = "http://127.0.0.1:8000/ask_localai"
+        payload = json.dumps({"query": ai_input})
+        headers = {
+            'Accept': 'application/json',
+            'Content-Type': 'application/json'
+        }
+        try:
+            response = requests.post(url, headers=headers, data=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            if "answer" in response_data:
+                query = response_data.get("question", "No question provided.")
+                answer = response_data.get("answer", "No answer provided.")
+                st.session_state.ai_result = f"### Question:\n{query}\n\n### Answer:\n{answer}"
+            else:
+                st.session_state.ai_result = "No 'answer' field found in the response."
+        except requests.exceptions.RequestException as e:
+            st.session_state.ai_result = f"HTTP Request failed: {e}"
+        except json.JSONDecodeError:
+            st.session_state.ai_result = "Failed to decode JSON response.."
+    if st.session_state.ai_result:
+        st.write(st.session_state.ai_result)
+    st.markdown(
+        f"<span style='font-size:16px;'>This AI model is trained from the following document: <a href='{pdf_url}' target='_blank' style='color: blue;'>View PDF</a></span>",
+        unsafe_allow_html=True,
+    )
+st.markdown('</div>', unsafe_allow_html=True)

uvicorn_start.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import uvicorn
+if __name__=="__main__":
+    uvicorn.run("api:app",host='127.0.0.1', port=8000, reload=True,  workers=3)