from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = Flask(__name__)

# Use a quantized model to save memory
model_name = "Qwen/Qwen-1_8B-Chat-Int4"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

@app.route("/v1/chat/completions", methods=["POST"])
def chat():
    data = request.json
    messages = data.get("messages", [])
    if not messages:
        # Guard against an empty or missing messages list instead of crashing
        return jsonify({"error": "messages must be a non-empty list"}), 400
    prompt = messages[-1]["content"]

    # Generate a response
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Return the response in OpenAI API format
    return jsonify({
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": response
                }
            }
        ]
    })

if __name__ == "__main__":
    app.run()
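
# --- Quick client-side test (run as a separate script) ---
# A minimal sketch for exercising the endpoint above, assuming the server is
# running on Flask's default http://127.0.0.1:5000 and the third-party
# `requests` package is installed. The request body mirrors the OpenAI
# chat-completions shape the server expects; the prompt text is just an
# illustrative example.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello! Who are you?"}]},
    timeout=60,
)
# Print the assistant's reply from the OpenAI-style response envelope
print(resp.json()["choices"][0]["message"]["content"])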