Reasoning Parser#

SGLang supports parsing the reasoning content out of the "normal" content for reasoning models such as DeepSeek R1.

Supported Models & Parsers#

Model                   Reasoning tags          Parser
DeepSeek-R1 series      <think> ... </think>    deepseek-r1
Qwen3 and QwQ series    <think> ... </think>    qwen3
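
In raw form, the generated text from such a model interleaves the chain of thought and the final answer, roughly as in the sketch below (abbreviated for illustration; depending on the chat template, the opening tag may come from the prompt rather than from the generation itself):

# Illustrative raw output before parsing (abbreviated, not verbatim model output).
raw_output = (
    "<think>First, identify the numbers 1 and 3, then add them: 1 + 3 = 4.</think>"
    "The answer is **4**."
)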

Usage#

Launching the Server#

Specify the --reasoning-parser option.

[1]:
import requests
from openai import OpenAI
from sglang.test.test_utils import is_in_ci

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd

from sglang.utils import wait_for_server, print_highlight, terminate_process


server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1"
)

wait_for_server(f"http://localhost:{port}")
[2025-05-15 22:32:32] server_args=ServerArgs(model_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, host='0.0.0.0', port=31754, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=952299064, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser='deepseek-r1', dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, mm_attention_backend=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', 
disaggregation_ib_device=None, pdlb_url=None)
[2025-05-15 22:32:40] Attention backend not set. Use fa3 backend by default.
[2025-05-15 22:32:40] Init torch distributed begin.
[2025-05-15 22:32:40] Init torch distributed ends. mem usage=0.00 GB
[2025-05-15 22:32:40] Load weight begin. avail mem=48.05 GB
[2025-05-15 22:32:41] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:02<00:02,  2.69s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:26<00:00, 14.86s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:26<00:00, 13.03s/it]

[2025-05-15 22:33:08] Load weight end. type=Qwen2ForCausalLM, dtype=torch.bfloat16, avail mem=45.34 GB, mem usage=2.71 GB.
[2025-05-15 22:33:08] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
[2025-05-15 22:33:08] Memory pool end. avail mem=43.97 GB
[2025-05-15 22:33:09] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
[2025-05-15 22:33:09] INFO:     Started server process [54701]
[2025-05-15 22:33:09] INFO:     Waiting for application startup.
[2025-05-15 22:33:09] INFO:     Application startup complete.
[2025-05-15 22:33:09] INFO:     Uvicorn running on http://0.0.0.0:31754 (Press CTRL+C to quit)
[2025-05-15 22:33:10] INFO:     127.0.0.1:60744 - "GET /v1/models HTTP/1.1" 200 OK
[2025-05-15 22:33:10] INFO:     127.0.0.1:46806 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-05-15 22:33:11] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:12] INFO:     127.0.0.1:46812 - "POST /generate HTTP/1.1" 200 OK
[2025-05-15 22:33:12] The server is fired up and ready to roll!


Note: Typically, the server runs in a separate terminal.
In this notebook, we run the server and the notebook code together, so their outputs are combined.
To improve clarity, the server logs are shown in plain black, while the notebook output is highlighted in blue.
We are running these notebooks in a CI parallel environment, so the throughput is not representative of actual performance.

Note that --reasoning-parser defines the parser used to interpret the response.
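
For example, to serve a Qwen3-family model you would pass the qwen3 parser from the table above. A minimal sketch, assuming a Qwen3 checkpoint (the model path below is illustrative, not taken from this document):

# Sketch only: launch with the qwen3 reasoning parser for a Qwen3-family model.
server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --reasoning-parser qwen3"
)

wait_for_server(f"http://localhost:{port}")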

OpenAI Compatible API#

When using the OpenAI compatible API, the protocol follows the DeepSeek API design, which was established with the release of DeepSeek-R1:

  • reasoning_content: the content of the CoT.

  • content: the content of the final answer.
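
For orientation, a parsed (non-streaming) chat completion message then carries both fields, roughly in the shape sketched below (values abbreviated; an illustration of the layout, not verbatim server output):

# Illustrative shape of a parsed chat completion message (values abbreviated).
example_message = {
    "role": "assistant",
    "reasoning_content": "First, I recognize that the problem asks for the sum ...",  # CoT
    "content": "**Solution:** ... The final answer is 4.",  # final answer
}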

[2]:
# Initialize OpenAI-like client
client = OpenAI(api_key="None", base_url=f"http://0.0.0.0:{port}/v1")
model_name = client.models.list().data[0].id

messages = [
    {
        "role": "user",
        "content": "What is 1+3?",
    }
]
[2025-05-15 22:33:15] INFO:     127.0.0.1:46820 - "GET /v1/models HTTP/1.1" 200 OK

Non-Streaming Request#

[3]:
response_non_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=False,  # Non-streaming
    extra_body={"separate_reasoning": True},
)
print_highlight("==== Reasoning ====")
print_highlight(response_non_stream.choices[0].message.reasoning_content)

print_highlight("==== Text ====")
print_highlight(response_non_stream.choices[0].message.content)
[2025-05-15 22:33:15] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:16] Decode batch. #running-req: 1, #token: 45, token usage: 0.00, cuda graph: False, gen throughput (token/s): 6.16, #queue-req: 0
[2025-05-15 22:33:16] Decode batch. #running-req: 1, #token: 85, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.69, #queue-req: 0
[2025-05-15 22:33:17] Decode batch. #running-req: 1, #token: 125, token usage: 0.01, cuda graph: False, gen throughput (token/s): 64.60, #queue-req: 0
[2025-05-15 22:33:17] Decode batch. #running-req: 1, #token: 165, token usage: 0.01, cuda graph: False, gen throughput (token/s): 68.91, #queue-req: 0
[2025-05-15 22:33:18] INFO:     127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
==== Reasoning ====
First, I recognize that the problem asks for the sum of the numbers 1 and 3.

To solve this, I start by identifying the two numbers involved, which are 1 and 3.

Next, I perform the addition by adding the two numbers together.

Finally, the result of adding 1 and 3 is 4.
==== Text ====
**Solution:**

To find the sum of \(1\) and \(3\), follow these simple steps:

1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]

2. **Perform the addition:**
\[
1 + 3
\]

3. **Calculate the result:**
\[
1 + 3 = 4
\]

**Final Answer:**
\[
\boxed{4}
\]

Streaming Request#

[4]:
response_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=True,  # Streaming
    extra_body={"separate_reasoning": True},
)

reasoning_content = ""
content = ""
for chunk in response_stream:
    if chunk.choices[0].delta.content:
        content += chunk.choices[0].delta.content
    if chunk.choices[0].delta.reasoning_content:
        reasoning_content += chunk.choices[0].delta.reasoning_content

print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)

print_highlight("==== Text ====")
print_highlight(content)
[2025-05-15 22:33:18] INFO:     127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-05-15 22:33:18] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:18] Decode batch. #running-req: 1, #token: 18, token usage: 0.00, cuda graph: False, gen throughput (token/s): 62.03, #queue-req: 0
[2025-05-15 22:33:19] Decode batch. #running-req: 1, #token: 58, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.58, #queue-req: 0
[2025-05-15 22:33:19] Decode batch. #running-req: 1, #token: 98, token usage: 0.00, cuda graph: False, gen throughput (token/s): 68.05, #queue-req: 0
==== Reasoning ====
First, I recognize that the problem asks for the sum of 1 and 3.

Next, I add the two numbers together: 1 plus 3 equals 4.

Finally, I present the result as the answer.
==== Text ====


**Solution:**

We are asked to find the sum of 1 and 3.

\[
1 + 3 = 4
\]

**Answer:** \boxed{4}

Alternatively, you can buffer the reasoning content until the last reasoning chunk (or the first chunk after the reasoning content).

[5]:
response_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=True,  # Streaming
    extra_body={"separate_reasoning": True, "stream_reasoning": False},
)

reasoning_content = ""
content = ""
for chunk in response_stream:
    if chunk.choices[0].delta.content:
        content += chunk.choices[0].delta.content
    if chunk.choices[0].delta.reasoning_content:
        reasoning_content = chunk.choices[0].delta.reasoning_content

print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)

print_highlight("==== Text ====")
print_highlight(content)
[2025-05-15 22:33:19] INFO:     127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-05-15 22:33:19] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:20] Decode batch. #running-req: 1, #token: 50, token usage: 0.00, cuda graph: False, gen throughput (token/s): 55.56, #queue-req: 0
[2025-05-15 22:33:20] Decode batch. #running-req: 1, #token: 90, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.25, #queue-req: 0
[2025-05-15 22:33:21] Decode batch. #running-req: 1, #token: 130, token usage: 0.01, cuda graph: False, gen throughput (token/s): 67.38, #queue-req: 0
==== Reasoning ====
First, I recognize that the problem asks for the sum of the numbers 1 and 3.

Next, I add the two numbers together: 1 plus 3 equals 4.

Therefore, the final answer is 4.
==== Text ====


**Solution:**

We are asked to find the sum of \(1\) and \(3\).

1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]

2. **Add the numbers:**
\[
1 + 3 = 4
\]

3. **Present the final answer:**
\[
\boxed{4}
\]

When using a reasoning parser, reasoning separation is enabled by default. To disable it, set the ``separate_reasoning`` option to ``False`` in the request.

[6]:
response_non_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=False,  # Non-streaming
    extra_body={"separate_reasoning": False},
)

print_highlight("==== Original Output ====")
print_highlight(response_non_stream.choices[0].message.content)
[2025-05-15 22:33:22] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:22] Decode batch. #running-req: 1, #token: 21, token usage: 0.00, cuda graph: False, gen throughput (token/s): 59.97, #queue-req: 0
[2025-05-15 22:33:22] Decode batch. #running-req: 1, #token: 61, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.71, #queue-req: 0
[2025-05-15 22:33:23] Decode batch. #running-req: 1, #token: 101, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.86, #queue-req: 0
[2025-05-15 22:33:24] Decode batch. #running-req: 1, #token: 141, token usage: 0.01, cuda graph: False, gen throughput (token/s): 67.92, #queue-req: 0
[2025-05-15 22:33:24] INFO:     127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
==== Original Output ====
First, I recognize that the problem asks for the sum of 1 and 3.

Next, I perform the addition of the two numbers.

Finally, I obtain the result, which is 4.


**Solution:**

We are asked to find the sum of 1 and 3.

1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]

2. **Perform the addition:**
\[
1 + 3 = 4
\]

**Final Answer:**
\[
\boxed{4}
\]

SGLang Native API#

[7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
input = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

gen_url = f"http://localhost:{port}/generate"
gen_data = {
    "text": input,
    "sampling_params": {
        "skip_special_tokens": False,
        "max_new_tokens": 1024,
        "temperature": 0.6,
        "top_p": 0.95,
    },
}
gen_response = requests.post(gen_url, json=gen_data).json()["text"]

print_highlight("==== Original Output ====")
print_highlight(gen_response)

parse_url = f"http://localhost:{port}/separate_reasoning"
separate_reasoning_data = {
    "text": gen_response,
    "reasoning_parser": "deepseek-r1",
}
separate_reasoning_response_json = requests.post(
    parse_url, json=separate_reasoning_data
).json()
print_highlight("==== Reasoning ====")
print_highlight(separate_reasoning_response_json["reasoning_text"])
print_highlight("==== Text ====")
print_highlight(separate_reasoning_response_json["text"])
[2025-05-15 22:33:24] Prefill batch. #new-seq: 1, #new-token: 12, #cached-token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:25] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, cuda graph: False, gen throughput (token/s): 33.62, #queue-req: 0
[2025-05-15 22:33:25] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.64, #queue-req: 0
[2025-05-15 22:33:25] INFO:     127.0.0.1:44418 - "POST /generate HTTP/1.1" 200 OK
==== Original Output ====
First, I recognize that I need to add the numbers 1 and 3.

Next, I perform the addition of the two numbers.

Finally, I obtain the sum, which is 4.


**Solution:**

We need to find the sum of 1 and 3.

\[
1 + 3 = 4
\]

**Answer:** \boxed{4}
[2025-05-15 22:33:25] INFO:     127.0.0.1:44422 - "POST /separate_reasoning HTTP/1.1" 200 OK
==== Reasoning ====
First, I recognize that I need to add the numbers 1 and 3.

Next, I perform the addition of the two numbers.

Finally, I obtain the sum, which is 4.
==== Text ====
**Solution:**

We need to find the sum of 1 and 3.

\[
1 + 3 = 4
\]

**Answer:** \boxed{4}
[8]:
terminate_process(server_process)
[2025-05-15 22:33:25] Child process unexpectedly failed with an exit code 9. pid=54848

Offline Engine API#

[9]:
import sglang as sgl
from sglang.srt.reasoning_parser import ReasoningParser
from sglang.utils import print_highlight

llm = sgl.Engine(model_path="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
input = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
sampling_params = {
    "max_new_tokens": 1024,
    "skip_special_tokens": False,
    "temperature": 0.6,
    "top_p": 0.95,
}
result = llm.generate(prompt=input, sampling_params=sampling_params)

generated_text = result["text"]  # Assume there is only one prompt

print_highlight("==== Original Output ====")
print_highlight(generated_text)

parser = ReasoningParser("deepseek-r1")
reasoning_text, text = parser.parse_non_stream(generated_text)
print_highlight("==== Reasoning ====")
print_highlight(reasoning_text)
print_highlight("==== Text ====")
print_highlight(text)
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.05s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07<00:00,  3.69s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07<00:00,  3.60s/it]

==== Original Output ====
First, I recognize that the problem asks for the sum of 1 and 3.

Next, I perform the addition by combining the two numbers.

Finally, I obtain the result, which is 4.


Sure! Let's solve the addition problem step by step.

**Problem:**
What is \(1 + 3\)?

**Solution:**

1. **Start with the first number:**
\(1\)

2. **Add the second number:**
\(1 + 3\)

3. **Calculate the sum:**
\(1 + 3 = 4\)

**Final Answer:**
\(\boxed{4}\)
==== Reasoning ====
First, I recognize that the problem asks for the sum of 1 and 3.

Next, I perform the addition by combining the two numbers.

Finally, I obtain the result, which is 4.
==== Text ====
Sure! Let's solve the addition problem step by step.

**Problem:**
What is \(1 + 3\)?

**Solution:**

1. **Start with the first number:**
\(1\)

2. **Add the second number:**
\(1 + 3\)

3. **Calculate the sum:**
\(1 + 3 = 4\)

**Final Answer:**
\(\boxed{4}\)
[10]:
llm.shutdown()

Supporting New Reasoning Model Schemas#

For future reasoning models, you can implement the reasoning parser as a subclass of BaseReasoningFormatDetector in python/sglang/srt/reasoning_parser.py and specify the reasoning parser for the new reasoning model schema accordingly. The existing DeepSeek-R1 detector and the ReasoningParser dispatcher are shown below for reference, followed by a sketch of how a new detector could be added.

class DeepSeekR1Detector(BaseReasoningFormatDetector):
    """
    Detector for DeepSeek-R1 model.
    Assumes reasoning format:
      (<think>)*(.*)</think>
    Returns all the text before the </think> tag as `reasoning_text`
    and the rest of the text as `normal_text`.

    Args:
        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
            If True, streams reasoning content as it arrives.
    """

    def __init__(self, stream_reasoning: bool = False):
        # DeepSeek-R1 is assumed to be reasoning until `</think>` token
        super().__init__("<think>", "</think>", True, stream_reasoning=stream_reasoning)
        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599


class ReasoningParser:
    """
    Parser that handles both streaming and non-streaming scenarios for extracting
    reasoning content from model outputs.

    Args:
        model_type (str): Type of model to parse reasoning from
        stream_reasoning (bool): If False, accumulates reasoning content until complete.
            If True, streams reasoning content as it arrives.
    """

    DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
        "deepseek-r1": DeepSeekR1Detector
    }

    def __init__(self, model_type: str = None, stream_reasoning: bool = True):
        if not model_type:
            raise ValueError("Model type must be specified")

        detector_class = self.DetectorMap.get(model_type.lower())
        if not detector_class:
            raise ValueError(f"Unsupported model type: {model_type}")

        self.detector = detector_class(stream_reasoning=stream_reasoning)

    def parse_non_stream(self, full_text: str) -> StreamingParseResult:
        """Non-streaming call: one-time parsing"""
        ret = self.detector.detect_and_parse(full_text)
        return ret.reasoning_text, ret.normal_text

    def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult:
        """Streaming call: incremental parsing"""
        ret = self.detector.parse_streaming_increment(chunk_text)
        return ret.reasoning_text, ret.normal_text
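
As a rough sketch, a detector for a hypothetical model that wraps its chain of thought in <reasoning>...</reasoning> tags might look like the following. The tag strings and the "my-model" key are made up for illustration, and the constructor arguments mirror the DeepSeekR1Detector call above:

class MyModelDetector(BaseReasoningFormatDetector):
    """
    Hypothetical detector for a model that emits reasoning wrapped in
    <reasoning>...</reasoning> tags (tag names are illustrative only).
    """

    def __init__(self, stream_reasoning: bool = False):
        # Assume the model is reasoning until the closing tag, mirroring DeepSeekR1Detector.
        super().__init__(
            "<reasoning>", "</reasoning>", True, stream_reasoning=stream_reasoning
        )


# Register the detector so that "my-model" resolves to it
# (in practice this entry is added to DetectorMap in the source file).
ReasoningParser.DetectorMap["my-model"] = MyModelDetector

With the detector registered, passing --reasoning-parser my-model on the server command line, or constructing ReasoningParser("my-model") in the offline API, applies the new schema in the same way as the built-in parsers.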