Model overthinking / responses too slow
I even disabled thinking, and the model's responses are still too slow compared to gpt-oss-120b.
My command to start the server is:
cmd = [
sys.executable, '-m', 'vllm.entrypoints.openai.api_server',
'--seed', str(self.cfg.seed),
'--model', self.cfg.model_path,
'--served-model-name', self.cfg.served_model_name,
'--tensor-parallel-size', '1',
'--max-num-seqs', str(self.cfg.batch_size),
'--gpu-memory-utilization', str(self.cfg.gpu_memory_utilization),
'--host', '0.0.0.0',
'--port', str(self.port),
'--dtype', self.cfg.dtype,
'--kv-cache-dtype', self.cfg.kv_cache_dtype,
'--max-model-len', str(self.cfg.context_tokens),
'--stream-interval', str(self.cfg.stream_interval),
# '--async-scheduling',
'--disable-log-stats',
'--enable-prefix-caching',
'--trust-remote-code',
'--enable-chunked-prefill',
'--language-model-only',
#'--speculative-config','{"method": "mtp", "num_speculative_tokens": 1}',
'--enforce-eager',
'--reasoning-parser', 'qwen3',
]
return subprocess.Popen(
cmd, stdout=self.log_file, stderr=subprocess.STDOUT, start_new_session=True
)
And to call it:
extra_body = {
"chat_template_kwargs": {"enable_thinking": self.cfg.enable_thinking},
"top_k": 20,
}
stream = self.client.chat.completions.create(
model=self.model_name,
messages=messages,
max_tokens=max_tokens,
temperature=self.cfg.temperature,
top_p=0.95,
presence_penalty=1.5,
logprobs=False,
stop=self.cfg.stop_strings,
seed=seed,
stream=True,
extra_body=extra_body,
)
https://drive.google.com/file/d/1u53vaR-HOicoGsPcsIQx4IGbwhDSIr1h/view?usp=sharing
Resolution: using the latest vLLM build resolves this issue with thinking control.