# Use this script to chat with "mistral-7b-medical-o1-ft"; the model answers your questions until you type '\q' or 'quit' to end the conversation.
# !pip install unsloth  # (install Unsloth first if it is not already available)
from unsloth import FastLanguageModel
import torch
# Define the Alpaca-style prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction: {instruction}
### Input: {input_text}
### Response: {output}
"""
# Load your model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Subh775/mistral-7b-medical-o1-ft",
    max_seq_length=2048,
    load_in_4bit=True
)
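# load_in_4bit loads the 7B weights with 4-bit (bitsandbytes) quantization, which roughly
# quarters the VRAM needed; a CUDA-capable GPU is assumed, since inputs are moved to "cuda" below.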
# Enable optimized inference mode for faster generation
FastLanguageModel.for_inference(model)
# Function to handle the chat loop with memory
def chat():
    print("Chat with the model! Type '\\q' or 'quit' to stop.\n")
    chat_history = ""  # Store the conversation history

    while True:
        # Get user input
        user_input = input("You: ")

        # Exit condition
        if user_input.lower() in ['\\q', 'quit']:
            print("\nExiting the chat. Goodbye!")
            break

        # Format the current question with the Alpaca template and append it to the history
        prompt = alpaca_prompt.format(
            instruction="Please answer the following medical question.",
            input_text=user_input,
            output=""
        )
        chat_history += prompt + "\n"

        # Tokenize the combined history and move it to the GPU
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        # Generate a reply with sampling enabled
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,        # cap on the length of the generated reply
            temperature=0.7,           # sampling temperature; lower = more deterministic
            top_p=0.9,                 # nucleus sampling threshold
            num_return_sequences=1,    # only one candidate reply is needed
            do_sample=True,            # sampling must be on for temperature/top_p to apply
            no_repeat_ngram_size=2     # block repeated 2-grams to reduce repetition
        )

        # Decode and keep only the text after the final "### Response:" marker
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        clean_output = decoded_output[0].split('### Response:')[-1].strip()

        # Add the response to the chat history so later turns keep the context
        chat_history += f"{clean_output}\n"

        # Display the response
        print(f"\nModel: {clean_output}\n")
# Start the chat
chat()