ryoshimu committed · Commit 6246717 · Parent(s): 760e16a
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import os
 from typing import List, Tuple
+import spaces
 
 # Hugging Face token from environment variable
 HF_TOKEN = os.getenv("HF_TOKEN")
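The commit keeps `use_auth_token=HF_TOKEN` in the `from_pretrained` calls below. In recent transformers releases that argument is deprecated in favor of `token`; a minimal sketch of the equivalent call with the newer keyword, assuming the same `HF_TOKEN` environment variable, would be:

```python
import os
from transformers import AutoTokenizer

HF_TOKEN = os.getenv("HF_TOKEN")  # read the access token from the environment

# `token` replaces the deprecated `use_auth_token` argument in newer transformers releases
tokenizer = AutoTokenizer.from_pretrained(
    "elyza/Llama-3-ELYZA-JP-8B",
    token=HF_TOKEN,
    trust_remote_code=True,
)
```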
@@ -12,50 +13,54 @@ class ChatBot:
         self.model = None
         self.tokenizer = None
         self.current_model = None
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def load_model(self, model_name: str):
         """Load the model and tokenizer"""
-        if self.current_model == model_name:
+        if self.current_model == model_name and self.model is not None:
             return
 
         # Clear memory
         if self.model is not None:
             del self.model
-            del self.tokenizer
             torch.cuda.empty_cache()
 
-        #
+        # Load the tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_auth_token=HF_TOKEN,
            trust_remote_code=True
        )
 
+        # Set the pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Load the model (ZeroGPU-compatible)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_name,
             use_auth_token=HF_TOKEN,
-            torch_dtype=torch.float16
-
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
             trust_remote_code=True
         )
-
-        if self.device == "cuda":
-            self.model = self.model.to(self.device)
 
         self.current_model = model_name
 
+    @spaces.GPU(duration=60)
     def generate_response(self, message: str, history: List[Tuple[str, str]], model_name: str,
                           temperature: float = 0.7, max_tokens: int = 512) -> str:
         """Generate a response to the message"""
         # Load the model
         self.load_model(model_name)
 
+        # Move the model to the GPU
+        self.model.to('cuda')
+
         # Build the prompt
         prompt = self._build_prompt(message, history)
 
         # Tokenize
-        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
+        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to('cuda')
 
         # Generate
         with torch.no_grad():
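For context, `@spaces.GPU(duration=60)` is the ZeroGPU pattern on Hugging Face Spaces: the decorated function is granted a GPU only while it runs, for at most the requested number of seconds. A minimal sketch of the same pattern outside this class (the function name and generation settings are illustrative, not part of the commit) could look like:

```python
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Loaded on CPU at startup; ZeroGPU attaches a GPU only inside the decorated call.
tokenizer = AutoTokenizer.from_pretrained("elyza/Llama-3-ELYZA-JP-8B")
model = AutoModelForCausalLM.from_pretrained(
    "elyza/Llama-3-ELYZA-JP-8B", torch_dtype=torch.float16
)

@spaces.GPU(duration=60)  # request a GPU for up to 60 seconds per call
def generate(prompt: str) -> str:
    model.to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```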
@@ -71,6 +76,11 @@ class ChatBot:
 
         # Decode
         response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+
+        # Move the model back to the CPU (to save memory)
+        self.model.to('cpu')
+        torch.cuda.empty_cache()
+
         return response.strip()
 
     def _build_prompt(self, message: str, history: List[Tuple[str, str]]) -> str:
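The slice `outputs[0][inputs.shape[1]:]` drops the prompt tokens so only newly generated tokens are decoded. A self-contained sketch of that generate-and-decode flow (parameter values are illustrative) might be:

```python
import torch

def generate_reply(model, tokenizer, prompt: str,
                   temperature: float = 0.7, max_tokens: int = 512) -> str:
    # Encode the prompt; `inputs` has shape (1, prompt_length)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Keep only the tokens generated after the prompt, then decode them
    new_tokens = outputs[0][inputs.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
```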
@@ -143,7 +153,7 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
             model_select = gr.Dropdown(
                 choices=[
                     "elyza/Llama-3-ELYZA-JP-8B",
-                    "
+                    "cyberagent/open-calm-7b"
                 ],
                 value="elyza/Llama-3-ELYZA-JP-8B",
                 label="Model selection",
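The dropdown now offers two models. A minimal sketch of how such a dropdown is typically wired to a chat handler inside gr.Blocks (the handler and component names here are illustrative, not the app's actual ones) could be:

```python
import gradio as gr

def respond(message, history, model_name):
    # Placeholder handler; the real app would call ChatBot.generate_response here.
    history = history + [(message, f"({model_name}) echo: {message}")]
    return history, ""

with gr.Blocks() as demo:
    model_select = gr.Dropdown(
        choices=["elyza/Llama-3-ELYZA-JP-8B", "cyberagent/open-calm-7b"],
        value="elyza/Llama-3-ELYZA-JP-8B",
        label="Model selection",
    )
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")
    msg.submit(respond, inputs=[msg, chatbot, model_select], outputs=[chatbot, msg])

demo.launch()
```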
@@ -176,7 +186,8 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
 
     ### Notes
     - The first model load takes some time
-    -
+    - ZeroGPU enables fast inference
+    - Each generation completes within 60 seconds
     """)
 
     # Event handlers
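`generate_response` relies on `self._build_prompt(message, history)`, whose body is not part of this diff. A plausible sketch, assuming the model ships a chat template (as Llama-3-based models do), might look like the following; the actual helper in app.py may differ:

```python
from typing import List, Tuple

def build_prompt(tokenizer, message: str, history: List[Tuple[str, str]]) -> str:
    # Hypothetical helper: the real _build_prompt is not shown in this commit.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # apply_chat_template formats the conversation with the model's own template
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
```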