# Use this script to chat with "mistral-7b-medical-o1-ft"; the model answers your questions until you type '\q' or 'quit' to end the conversation.


# !pip install unsloth #(install unsloth if not installed)
 
from unsloth import FastLanguageModel
import torch


# Define the Alpaca prompt template and load model
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction: {instruction}
### Input: {input_text}
### Response: {output}
"""

# Load your model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Subh775/mistral-7b-medical-o1-ft",
    max_seq_length=2048,
    load_in_4bit=True
)
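# Note: load_in_4bit=True uses bitsandbytes quantization, so a CUDA-capable GPU is
# assumed here (the tokenized inputs are also moved to "cuda" explicitly below).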

# Enable optimized inference mode for faster generation
FastLanguageModel.for_inference(model)


# Function to handle the chat loop with memory

def chat():
    print("Chat with the model! Type '\\q' or 'quit' to stop.\n")

    chat_history = ""  # Store the conversation history

    while True:
        # Get user input
        user_input = input("You: ")

        # Exit condition
        if user_input.lower() in ['\\q', 'quit']:
            print("\nExiting the chat. Goodbye!")
            break

        # Append the current input to chat history with instruction formatting
        prompt = alpaca_prompt.format(
            instruction="Please answer the following medical question.",
            input_text=user_input,
            output=""
        )
        chat_history += prompt + "\n"

        # Tokenize combined history and move to GPU
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        # Generate with sampling: temperature 0.7, nucleus top-p 0.9, and no repeated 2-grams
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2
        )

        # Decode and clean the model's response
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        clean_output = decoded_output[0].split('### Response:')[-1].strip()

        # Append only the answer text; the "### Response:" marker is already in the history
        chat_history += f"{clean_output}\n"

        # Display the response
        print(f"\nModel: {clean_output}\n")

# Start the chat
chat()
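# Caveat: the full chat_history is re-tokenized on every turn, so long conversations
# can eventually exceed max_seq_length (2048 tokens); truncating or summarizing older
# turns before generation is a straightforward extension.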