model overthinking

#20
by cse2011 - opened

I even disabled thinking, and the model's responses are still too slow compared to gpt-oss-120b.
My command to start the server is:

cmd = [
sys.executable, '-m', 'vllm.entrypoints.openai.api_server',
'--seed', str(self.cfg.seed),
'--model', self.cfg.model_path,
'--served-model-name', self.cfg.served_model_name,
'--tensor-parallel-size', '1',
'--max-num-seqs', str(self.cfg.batch_size),
'--gpu-memory-utilization', str(self.cfg.gpu_memory_utilization),
'--host', '0.0.0.0',
'--port', str(self.port),
'--dtype', self.cfg.dtype,
'--kv-cache-dtype', self.cfg.kv_cache_dtype,
'--max-model-len', str(self.cfg.context_tokens),
'--stream-interval', str(self.cfg.stream_interval),
# '--async-scheduling',
'--disable-log-stats',
'--enable-prefix-caching',
'--trust-remote-code',
'--enable-chunked-prefill',
'--language-model-only',
#'--speculative-config','{"method": "mtp", "num_speculative_tokens": 1}',
'--enforce-eager',
'--reasoning-parser', 'qwen3',
]
return subprocess.Popen(
cmd, stdout=self.log_file, stderr=subprocess.STDOUT, start_new_session=True
)

and to use it:
extra_body = {

        "chat_template_kwargs": {"enable_thinking": self.cfg.enable_thinking},
        "top_k": 20,
        
    }
    stream = self.client.chat.completions.create(
        model=self.model_name,
        messages=messages,
        max_tokens=max_tokens,
        temperature=self.cfg.temperature,
        top_p=0.95,
        presence_penalty=1.5,
        logprobs=False,
        stop=self.cfg.stop_strings,
        seed=seed,
        stream=True,
        extra_body=extra_body,
    )

https://drive.google.com/file/d/1u53vaR-HOicoGsPcsIQx4IGbwhDSIr1h/view?usp=sharing

Using the latest vLLM build will resolve this issue with thinking control.

Sign up or log in to comment