Removed Hugging Face pipeline dependency and implemented direct model loading for DeepSeek-V3 using AutoTokenizer and AutoModelForCausalLM. Improved fallback robustness and error handling for model operations.
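For orientation, the loading pattern this commit switches to boils down to the sketch below. The checkpoint id is taken from the diff; the helper name and the condensed dtype/device handling are illustrative assumptions, not the Space's exact code.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "unsloth/DeepSeek-V3"  # checkpoint used by the Space (see diff below)

def load_deepseek():
    # Direct loading replaces transformers.pipeline(); trust_remote_code is needed
    # because the DeepSeek-V3 repo ships custom modeling code.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
    # fp16 on GPU when available, CPU otherwise, mirroring the new loaders in app.py
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return tokenizer, model.half().to(device)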
app.py
CHANGED
@@ -1,12 +1,7 @@
 import streamlit as st
 
-try:
-    from transformers import pipeline
-    USE_PIPELINE = True
-except ImportError:
-    from transformers import AutoTokenizer, AutoModelForCausalLM
-    import torch
-    USE_PIPELINE = False
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
 
 ##############################################################################
 # LOAD MODELS
@@ -14,59 +9,33 @@ except ImportError:
 
 @st.cache_resource
 def load_model_engineer():
-    [… 12 removed lines not captured here: pipeline-based loading branch]
-    else:
-        try:
-            # Fallback: Load model directly with fp16 precision
-            tokenizer = AutoTokenizer.from_pretrained("unsloth/DeepSeek-V3", trust_remote_code=True)
-            model = AutoModelForCausalLM.from_pretrained(
-                "unsloth/DeepSeek-V3",
-                trust_remote_code=True
-            )
-            model = model.half().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-            return tokenizer, model
-        except Exception as e:
-            st.error(f"Direct model loading failed for Engineer: {str(e)}")
-            raise
+    try:
+        # Engineer: DeepSeek-V3 loaded directly
+        tokenizer = AutoTokenizer.from_pretrained("unsloth/DeepSeek-V3", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            "unsloth/DeepSeek-V3",
+            trust_remote_code=True
+        )
+        model = model.half().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        return tokenizer, model
+    except Exception as e:
+        st.error(f"Direct model loading failed for Engineer: {str(e)}")
+        raise
 
 @st.cache_resource
 def load_model_analyst():
-    [… 12 removed lines not captured here: pipeline-based loading branch]
-    else:
-        try:
-            # Fallback: Load model directly with fp16 precision
-            tokenizer = AutoTokenizer.from_pretrained("unsloth/DeepSeek-V3", trust_remote_code=True)
-            model = AutoModelForCausalLM.from_pretrained(
-                "unsloth/DeepSeek-V3",
-                trust_remote_code=True
-            )
-            model = model.half().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-            return tokenizer, model
-        except Exception as e:
-            st.error(f"Direct model loading failed for Analyst: {str(e)}")
-            raise
+    try:
+        # Analyst: DeepSeek-V3 loaded directly
+        tokenizer = AutoTokenizer.from_pretrained("unsloth/DeepSeek-V3", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            "unsloth/DeepSeek-V3",
+            trust_remote_code=True
+        )
+        model = model.half().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        return tokenizer, model
+    except Exception as e:
+        st.error(f"Direct model loading failed for Analyst: {str(e)}")
+        raise
 
 # Load models
 try:
@@ -84,20 +53,16 @@ def generate_response(prompt, model, max_sentences=2):
     Generate a concise response based on the provided prompt.
     """
     try:
-    [… 10 removed lines not captured here]
-                top_p=0.8,
-                pad_token_id=tokenizer.pad_token_id
-            )
-            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+        tokenizer, model = model
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(
+            inputs["input_ids"],
+            max_new_tokens=50,
+            temperature=0.6,
+            top_p=0.8,
+            pad_token_id=tokenizer.pad_token_id
+        )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
         # Limit to max_sentences by splitting and rejoining
         return " ".join(response.split(".")[:max_sentences]) + "."
     except Exception as gen_error:
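The Streamlit UI that consumes these loaders is outside this diff. A hypothetical caller, written against the new signatures, might look like the following; note that generate_response expects the (tokenizer, model) tuple returned by the cached loaders, and the widget labels here are made up for illustration.

import streamlit as st

engineer = load_model_engineer()  # cached (tokenizer, model) tuple via @st.cache_resource
analyst = load_model_analyst()

prompt = st.text_input("Project prompt")  # hypothetical widget, not part of the diff
if prompt:
    # generate_response unpacks the tuple internally (tokenizer, model = model)
    st.write("Engineer:", generate_response(prompt, engineer))
    st.write("Analyst:", generate_response(prompt, analyst))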