Spaces:

doropiza
/

llm_chat_app

Running on Zero

App Files Files Community

ryoshimu committed on Jul 11

Commit

2445678

1 Parent(s): 06de6e5

commit

Browse files

Files changed (1) hide show

app.py +88 -36

app.py CHANGED Viewed

@@ -1,3 +1,12 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -7,12 +16,18 @@ from typing import List, Tuple
 # Hugging Face token from environment variable
 HF_TOKEN = os.getenv("HF_TOKEN")
 # Check if running on ZeroGPU
 try:
     import spaces
     IS_ZEROGPU = True
 except ImportError:
     IS_ZEROGPU = False
 class ChatBot:
     def __init__(self):
@@ -25,32 +40,45 @@ class ChatBot:
         if self.current_model == model_name and self.model is not None:
             return
-        # メモリクリア
-        if self.model is not None:
-            del self.model
-            torch.cuda.empty_cache()
-        # トークナイザーロード
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            use_auth_token=HF_TOKEN,
-            trust_remote_code=True
-        )
-        # パッドトークンの設定
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-        # モデルロード（ZeroGPU対応）
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            use_auth_token=HF_TOKEN,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
-        self.current_model = model_name
     def _generate_response_gpu(self, message: str, history: List[Tuple[str, str]], model_name: str,
                              temperature: float = 0.7, max_tokens: int = 512) -> str:
@@ -75,6 +103,8 @@ class ChatBot:
                 temperature=temperature,
                 do_sample=True,
                 top_p=0.95,
                 pad_token_id=self.tokenizer.pad_token_id,
                 eos_token_id=self.tokenizer.eos_token_id
             )
@@ -85,6 +115,7 @@ class ChatBot:
         # CPUに戻す（メモリ節約）
         self.model.to('cpu')
         torch.cuda.empty_cache()
         return response.strip()
@@ -112,6 +143,8 @@ class ChatBot:
                     temperature=temperature,
                     do_sample=True,
                     top_p=0.95,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
@@ -123,8 +156,8 @@ class ChatBot:
         """会話履歴からプロンプトを構築"""
         prompt = ""
-        # 履歴を追加
-        for user_msg, assistant_msg in history[-5:]:  # 最新5件の履歴のみ使用
             prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
         # 現在のメッセージを追加
@@ -137,7 +170,7 @@ chatbot = ChatBot()
 # ZeroGPU環境の場合、GPUデコレータを適用
 if IS_ZEROGPU:
-    chatbot._generate_response_gpu = spaces.GPU(duration=60)(chatbot._generate_response_gpu)
 def respond(message: str, history: List[Tuple[str, str]], model_name: str,
             temperature: float, max_tokens: int) -> Tuple[List[Tuple[str, str]], str]:
@@ -152,6 +185,13 @@ def respond(message: str, history: List[Tuple[str, str]], model_name: str,
         # 履歴に追加
         history.append((message, response))
         return history, ""
     except Exception as e:
         error_msg = f"エラーが発生しました: {str(e)}"
@@ -165,7 +205,13 @@ def clear_chat() -> Tuple[List, str]:
 # Gradio UI
 with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 🤖 ChatGPT Clone")
-    gr.Markdown("日本語対応のLLMを使用したチャットボットです。")
     with gr.Row():
         with gr.Column(scale=3):
@@ -192,10 +238,10 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
         with gr.Column(scale=1):
             model_select = gr.Dropdown(
                 choices=[
-                    "rinna/japanese-gpt2-medium",
-                    "cyberagent/open-calm-small"
                 ],
-                value="rinna/japanese-gpt2-medium",
                 label="モデル選択",
                 interactive=True
             )
@@ -211,8 +257,8 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
             max_tokens = gr.Slider(
                 minimum=64,
-                maximum=1024,
-                value=512,
                 step=64,
                 label="最大トークン数",
                 info="生成する最大トークン数"
@@ -227,7 +273,8 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
             ### 注意事項
             - 初回のモデル読み込みには時間がかかります
             - ZeroGPU使用により高速推論が可能
-            - 1回の生成は60秒以内に完了します
             """)
     # イベントハンドラ
@@ -249,7 +296,12 @@ with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
     )
 if __name__ == "__main__":
     app.launch(
         share=False,
-        show_error=True
     )

+"""
+ChatGPT Clone - 日本語対応チャットボット
+Hugging Face Spaces (ZeroGPU) 対応版
+使用モデル:
+- elyza/Llama-3-ELYZA-JP-8B
+- Fugaku-LLM/Fugaku-LLM-13B
+"""
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 # Hugging Face token from environment variable
 HF_TOKEN = os.getenv("HF_TOKEN")
+# トークンのチェック
+if not HF_TOKEN:
+    print("警告: HF_TOKENが設定されていません。プライベートモデルへのアクセスが制限される場合があります。")
 # Check if running on ZeroGPU
 try:
     import spaces
     IS_ZEROGPU = True
+    print("ZeroGPU環境を検出しました。")
 except ImportError:
     IS_ZEROGPU = False
+    print("通常のGPU/CPU環境で実行しています。")
 class ChatBot:
     def __init__(self):
         if self.current_model == model_name and self.model is not None:
             return
+        try:
+            # メモリクリア
+            if self.model is not None:
+                del self.model
+                del self.tokenizer
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    torch.cuda.synchronize()
+            # トークナイザーロード
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                token=HF_TOKEN,
+                trust_remote_code=True,
+                padding_side="left"
+            )
+            # パッドトークンの設定
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+                self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+            # モデルロード（ZeroGPU対応）
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                token=HF_TOKEN,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                load_in_8bit=False,  # ZeroGPU環境では8bit量子化は使わない
+                device_map=None  # ZeroGPU環境では自動マッピングしない
+            )
+            self.current_model = model_name
+            print(f"モデル {model_name} のロードが完了しました。")
+        except Exception as e:
+            print(f"モデルのロード中にエラーが発生しました: {str(e)}")
+            raise
     def _generate_response_gpu(self, message: str, history: List[Tuple[str, str]], model_name: str,
                              temperature: float = 0.7, max_tokens: int = 512) -> str:
                 temperature=temperature,
                 do_sample=True,
                 top_p=0.95,
+                top_k=50,
+                repetition_penalty=1.1,
                 pad_token_id=self.tokenizer.pad_token_id,
                 eos_token_id=self.tokenizer.eos_token_id
             )
         # CPUに戻す（メモリ節約）
         self.model.to('cpu')
         torch.cuda.empty_cache()
+        torch.cuda.synchronize()
         return response.strip()
                     temperature=temperature,
                     do_sample=True,
                     top_p=0.95,
+                    top_k=50,
+                    repetition_penalty=1.1,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
         """会話履歴からプロンプトを構築"""
         prompt = ""
+        # 履歴を追加（最新3件のみ使用 - メモリ効率のため）
+        for user_msg, assistant_msg in history[-3:]:
             prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n\n"
         # 現在のメッセージを追加
 # ZeroGPU環境の場合、GPUデコレータを適用
 if IS_ZEROGPU:
+    chatbot._generate_response_gpu = spaces.GPU(duration=120)(chatbot._generate_response_gpu)
 def respond(message: str, history: List[Tuple[str, str]], model_name: str,
             temperature: float, max_tokens: int) -> Tuple[List[Tuple[str, str]], str]:
         # 履歴に追加
         history.append((message, response))
+        return history, ""
+    except RuntimeError as e:
+        if "out of memory" in str(e).lower():
+            error_msg = "メモリ不足エラー: より小さいモデルを使用するか、最大トークン数を減らしてください。"
+        else:
+            error_msg = f"実行時エラー: {str(e)}"
+        history.append((message, error_msg))
         return history, ""
     except Exception as e:
         error_msg = f"エラーが発生しました: {str(e)}"
 # Gradio UI
 with gr.Blocks(title="ChatGPT Clone", theme=gr.themes.Soft()) as app:
     gr.Markdown("# 🤖 ChatGPT Clone")
+    gr.Markdown("""
+    日本語対応のLLMを使用したチャットボットです。
+    **使用可能モデル:**
+    - [elyza/Llama-3-ELYZA-JP-8B](https://huggingface.co/elyza/Llama-3-ELYZA-JP-8B)
+    - [Fugaku-LLM/Fugaku-LLM-13B](https://huggingface.co/Fugaku-LLM/Fugaku-LLM-13B)
+    """)
     with gr.Row():
         with gr.Column(scale=3):
         with gr.Column(scale=1):
             model_select = gr.Dropdown(
                 choices=[
+                    "elyza/Llama-3-ELYZA-JP-8B",
+                    "Fugaku-LLM/Fugaku-LLM-13B"
                 ],
+                value="elyza/Llama-3-ELYZA-JP-8B",
                 label="モデル選択",
                 interactive=True
             )
             max_tokens = gr.Slider(
                 minimum=64,
+                maximum=512,
+                value=256,
                 step=64,
                 label="最大トークン数",
                 info="生成する最大トークン数"
             ### 注意事項
             - 初回のモデル読み込みには時間がかかります
             - ZeroGPU使用により高速推論が可能
+            - 1回の生成は120秒以内に完了します
+            - 大きなモデル使用時は、短めの応答になる場合があります
             """)
     # イベントハンドラ
     )
 if __name__ == "__main__":
+    # Hugging Face Spaces環境かどうかを確認
+    is_hf_spaces = os.getenv("SPACE_ID") is not None
     app.launch(
         share=False,
+        show_error=True,
+        server_name="0.0.0.0" if is_hf_spaces else "127.0.0.1",
+        server_port=7860
     )