Reasoning Parser#
SGLang supports parsing reasoning content out of "normal" content for reasoning models such as DeepSeek R1.
Supported Models & Parsers#
Model | Reasoning tags | Parser
---|---|---
DeepSeek-R1 series (e.g. deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | `<think>` ... `</think>` | deepseek-r1
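For illustration, here is a minimal sketch (not part of the original notebook) of how a deepseek-r1 style output is split on the reasoning tags; the raw output string below is a hypothetical example:
# Minimal illustration (assumed example) of splitting a deepseek-r1 style output
# on the </think> tag into reasoning text and normal text.
raw_output = "<think>The user asks for 1 + 3, which equals 4.</think>The answer is 4."
reasoning_text, _, normal_text = raw_output.removeprefix("<think>").partition("</think>")
print(reasoning_text)  # The user asks for 1 + 3, which equals 4.
print(normal_text)     # The answer is 4.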
Usage#
Launching the Server#
Specify the --reasoning-parser option.
[1]:
import requests
from openai import OpenAI
from sglang.test.test_utils import is_in_ci
if is_in_ci():
from patch import launch_server_cmd
else:
from sglang.utils import launch_server_cmd
from sglang.utils import wait_for_server, print_highlight, terminate_process
server_process, port = launch_server_cmd(
"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1"
)
wait_for_server(f"http://localhost:{port}")
[2025-05-15 22:32:32] server_args=ServerArgs(model_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, host='0.0.0.0', port=31754, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=952299064, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser='deepseek-r1', dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_ep_moe=False, enable_deepep_moe=False, deepep_mode='auto', enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=None, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, warmups=None, moe_dense_tp_size=None, n_share_experts_fusion=0, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, mm_attention_backend=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998, disaggregation_transfer_backend='mooncake', 
disaggregation_ib_device=None, pdlb_url=None)
[2025-05-15 22:32:40] Attention backend not set. Use fa3 backend by default.
[2025-05-15 22:32:40] Init torch distributed begin.
[2025-05-15 22:32:40] Init torch distributed ends. mem usage=0.00 GB
[2025-05-15 22:32:40] Load weight begin. avail mem=48.05 GB
[2025-05-15 22:32:41] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:02<00:02, 2.69s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:26<00:00, 14.86s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:26<00:00, 13.03s/it]
[2025-05-15 22:33:08] Load weight end. type=Qwen2ForCausalLM, dtype=torch.bfloat16, avail mem=45.34 GB, mem usage=2.71 GB.
[2025-05-15 22:33:08] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
[2025-05-15 22:33:08] Memory pool end. avail mem=43.97 GB
[2025-05-15 22:33:09] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
[2025-05-15 22:33:09] INFO: Started server process [54701]
[2025-05-15 22:33:09] INFO: Waiting for application startup.
[2025-05-15 22:33:09] INFO: Application startup complete.
[2025-05-15 22:33:09] INFO: Uvicorn running on http://0.0.0.0:31754 (Press CTRL+C to quit)
[2025-05-15 22:33:10] INFO: 127.0.0.1:60744 - "GET /v1/models HTTP/1.1" 200 OK
[2025-05-15 22:33:10] INFO: 127.0.0.1:46806 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-05-15 22:33:11] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:12] INFO: 127.0.0.1:46812 - "POST /generate HTTP/1.1" 200 OK
[2025-05-15 22:33:12] The server is fired up and ready to roll!
Note: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are shown in plain black, while the notebook output is highlighted in blue.
We are running these notebooks in a CI parallel environment, so the throughput is not representative of actual performance.
Note that --reasoning-parser defines the parser used to interpret the responses.
OpenAI Compatible API#
When using the OpenAI compatible API, the protocol follows the DeepSeek API design established with the release of DeepSeek-R1:
- reasoning_content: the content of the CoT (chain of thought).
- content: the content of the final answer.
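As an illustration, the assistant message in a response therefore carries both fields side by side. A minimal, abridged sketch of its shape (the field values here are hypothetical):
# Abridged sketch of the assistant message returned by /v1/chat/completions
# when separate_reasoning is enabled; the values are hypothetical placeholders.
message = {
    "role": "assistant",
    "reasoning_content": "First, I recognize that the problem asks for ...",  # CoT
    "content": "**Solution:** ... \\boxed{4}",  # final answer
}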
[2]:
# Initialize OpenAI-like client
client = OpenAI(api_key="None", base_url=f"http://0.0.0.0:{port}/v1")
model_name = client.models.list().data[0].id
messages = [
{
"role": "user",
"content": "What is 1+3?",
}
]
[2025-05-15 22:33:15] INFO: 127.0.0.1:46820 - "GET /v1/models HTTP/1.1" 200 OK
Non-Streaming Request#
[3]:
response_non_stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=0.6,
top_p=0.95,
stream=False, # Non-streaming
extra_body={"separate_reasoning": True},
)
print_highlight("==== Reasoning ====")
print_highlight(response_non_stream.choices[0].message.reasoning_content)
print_highlight("==== Text ====")
print_highlight(response_non_stream.choices[0].message.content)
[2025-05-15 22:33:15] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:16] Decode batch. #running-req: 1, #token: 45, token usage: 0.00, cuda graph: False, gen throughput (token/s): 6.16, #queue-req: 0
[2025-05-15 22:33:16] Decode batch. #running-req: 1, #token: 85, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.69, #queue-req: 0
[2025-05-15 22:33:17] Decode batch. #running-req: 1, #token: 125, token usage: 0.01, cuda graph: False, gen throughput (token/s): 64.60, #queue-req: 0
[2025-05-15 22:33:17] Decode batch. #running-req: 1, #token: 165, token usage: 0.01, cuda graph: False, gen throughput (token/s): 68.91, #queue-req: 0
[2025-05-15 22:33:18] INFO: 127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
==== Reasoning ====
First, I recognize that the problem asks for the sum of the numbers 1 and 3.
To solve this, I identify the two numbers involved, which are 1 and 3.
Next, I perform the addition by adding these two numbers together.
Finally, adding 1 and 3 gives a result of 4.
==== Text ====
**Solution:**
To find the sum of \(1\) and \(3\), follow these simple steps:
1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]
2. **Perform the addition:**
\[
1 + 3
\]
3. **Calculate the result:**
\[
1 + 3 = 4
\]
**Final Answer:**
\[
\boxed{4}
\]
Streaming Request#
[4]:
response_stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=0.6,
top_p=0.95,
stream=True,  # Streaming
extra_body={"separate_reasoning": True},
)
reasoning_content = ""
content = ""
for chunk in response_stream:
if chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
if chunk.choices[0].delta.reasoning_content:
reasoning_content += chunk.choices[0].delta.reasoning_content
print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)
print_highlight("==== Text ====")
print_highlight(content)
[2025-05-15 22:33:18] INFO: 127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-05-15 22:33:18] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:18] Decode batch. #running-req: 1, #token: 18, token usage: 0.00, cuda graph: False, gen throughput (token/s): 62.03, #queue-req: 0
[2025-05-15 22:33:19] Decode batch. #running-req: 1, #token: 58, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.58, #queue-req: 0
[2025-05-15 22:33:19] Decode batch. #running-req: 1, #token: 98, token usage: 0.00, cuda graph: False, gen throughput (token/s): 68.05, #queue-req: 0
==== Reasoning ====
First, I recognize that the problem asks for the sum of 1 and 3.
Next, I add the two numbers together: 1 plus 3 equals 4.
Finally, I present the result as the answer.
==== Text ====
**Solution:**
We are asked to find the sum of 1 and 3.
\[
1 + 3 = 4
\]
**Answer:** \boxed{4}
Alternatively, you can buffer the reasoning content until the last reasoning chunk (or the first chunk after the reasoning content).
[5]:
response_stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=0.6,
top_p=0.95,
stream=True,  # Streaming
extra_body={"separate_reasoning": True, "stream_reasoning": False},
)
reasoning_content = ""
content = ""
for chunk in response_stream:
if chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
if chunk.choices[0].delta.reasoning_content:
reasoning_content = chunk.choices[0].delta.reasoning_content
print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)
print_highlight("==== Text ====")
print_highlight(content)
[2025-05-15 22:33:19] INFO: 127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-05-15 22:33:19] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:20] Decode batch. #running-req: 1, #token: 50, token usage: 0.00, cuda graph: False, gen throughput (token/s): 55.56, #queue-req: 0
[2025-05-15 22:33:20] Decode batch. #running-req: 1, #token: 90, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.25, #queue-req: 0
[2025-05-15 22:33:21] Decode batch. #running-req: 1, #token: 130, token usage: 0.01, cuda graph: False, gen throughput (token/s): 67.38, #queue-req: 0
==== Reasoning ====
First, I recognize that the problem asks for the sum of the numbers 1 and 3.
Next, I add the two numbers together: 1 plus 3 equals 4.
Therefore, the final answer is 4.
==== Text ====
**Solution:**
We are asked to find the sum of \(1\) and \(3\).
1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]
2. **Add the numbers:**
\[
1 + 3 = 4
\]
3. **Present the final answer:**
\[
\boxed{4}
\]
Reasoning separation is enabled by default when a reasoning parser is specified. To disable it, set the ``separate_reasoning`` option to ``False`` in the request.
[6]:
response_non_stream = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=0.6,
top_p=0.95,
stream=False, # Non-streaming
extra_body={"separate_reasoning": False},
)
print_highlight("==== Original Output ====")
print_highlight(response_non_stream.choices[0].message.content)
[2025-05-15 22:33:22] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 11, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:22] Decode batch. #running-req: 1, #token: 21, token usage: 0.00, cuda graph: False, gen throughput (token/s): 59.97, #queue-req: 0
[2025-05-15 22:33:22] Decode batch. #running-req: 1, #token: 61, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.71, #queue-req: 0
[2025-05-15 22:33:23] Decode batch. #running-req: 1, #token: 101, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.86, #queue-req: 0
[2025-05-15 22:33:24] Decode batch. #running-req: 1, #token: 141, token usage: 0.01, cuda graph: False, gen throughput (token/s): 67.92, #queue-req: 0
[2025-05-15 22:33:24] INFO: 127.0.0.1:46820 - "POST /v1/chat/completions HTTP/1.1" 200 OK
==== Original Output ====
First, I recognize that the problem asks for the sum of 1 and 3.
Next, I perform the addition of these two numbers.
Finally, I obtain the result, which is 4.
**Solution:**
We are asked to find the sum of 1 and 3.
1. **Identify the numbers to add:**
\[
1 \quad \text{and} \quad 3
\]
2. **Perform the addition:**
\[
1 + 3 = 4
\]
**Final Answer:**
\[
\boxed{4}
\]
SGLang Native API#
[7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
input = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
gen_url = f"http://localhost:{port}/generate"
gen_data = {
"text": input,
"sampling_params": {
"skip_special_tokens": False,
"max_new_tokens": 1024,
"temperature": 0.6,
"top_p": 0.95,
},
}
gen_response = requests.post(gen_url, json=gen_data).json()["text"]
print_highlight("==== Original Output ====")
print_highlight(gen_response)
parse_url = f"http://localhost:{port}/separate_reasoning"
separate_reasoning_data = {
"text": gen_response,
"reasoning_parser": "deepseek-r1",
}
separate_reasoning_response_json = requests.post(
parse_url, json=separate_reasoning_data
).json()
print_highlight("==== Reasoning ====")
print_highlight(separate_reasoning_response_json["reasoning_text"])
print_highlight("==== Text ====")
print_highlight(separate_reasoning_response_json["text"])
[2025-05-15 22:33:24] Prefill batch. #new-seq: 1, #new-token: 12, #cached-token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-15 22:33:25] Decode batch. #running-req: 1, #token: 48, token usage: 0.00, cuda graph: False, gen throughput (token/s): 33.62, #queue-req: 0
[2025-05-15 22:33:25] Decode batch. #running-req: 1, #token: 88, token usage: 0.00, cuda graph: False, gen throughput (token/s): 67.64, #queue-req: 0
[2025-05-15 22:33:25] INFO: 127.0.0.1:44418 - "POST /generate HTTP/1.1" 200 OK
==== Original Output ====
First, I recognize that I need to add the numbers 1 and 3.
Next, I perform the addition of these two numbers.
Finally, I obtain the sum, 4.
**Solution:**
We need to find the sum of 1 and 3.
\[
1 + 3 = 4
\]
**Answer:** \boxed{4}
[2025-05-15 22:33:25] INFO: 127.0.0.1:44422 - "POST /separate_reasoning HTTP/1.1" 200 OK
==== Reasoning ====
First, I recognize that I need to add the numbers 1 and 3.
Next, I perform the addition of these two numbers.
Finally, I obtain the sum, 4.
==== Text ====
**Solution:**
We need to find the sum of 1 and 3.
\[
1 + 3 = 4
\]
**Answer:** \boxed{4}
[8]:
terminate_process(server_process)
[2025-05-15 22:33:25] Child process unexpectedly failed with an exit code 9. pid=54848
Offline Engine API#
[9]:
import sglang as sgl
from sglang.srt.reasoning_parser import ReasoningParser
from sglang.utils import print_highlight
llm = sgl.Engine(model_path="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
input = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
sampling_params = {
"max_new_tokens": 1024,
"skip_special_tokens": False,
"temperature": 0.6,
"top_p": 0.95,
}
result = llm.generate(prompt=input, sampling_params=sampling_params)
generated_text = result["text"] # Assume there is only one prompt
print_highlight("==== Original Output ====")
print_highlight(generated_text)
parser = ReasoningParser("deepseek-r1")
reasoning_text, text = parser.parse_non_stream(generated_text)
print_highlight("==== Reasoning ====")
print_highlight(reasoning_text)
print_highlight("==== Text ====")
print_highlight(text)
Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:03<00:03, 3.05s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.69s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:07<00:00, 3.60s/it]
==== Original Output ====
First, I recognize that the problem asks for the sum of 1 and 3.
Next, I perform the addition by combining the two numbers.
Finally, I obtain the result, which is 4.
Sure! Let's solve the addition problem step by step.
**Problem:**
What is \(1 + 3\)?
**Solution:**
1. **Start with the first number:**
\(1\)
2. **Add the second number:**
\(1 + 3\)
3. **Calculate the sum:**
\(1 + 3 = 4\)
**Final Answer:**
\(\boxed{4}\)
==== Reasoning ====
First, I recognize that the problem asks for the sum of 1 and 3.
Next, I perform the addition by combining the two numbers.
Finally, I obtain the result, which is 4.
==== Text ====
Sure! Let's solve the addition problem step by step.
**Problem:**
What is \(1 + 3\)?
**Solution:**
1. **Start with the first number:**
\(1\)
2. **Add the second number:**
\(1 + 3\)
3. **Calculate the sum:**
\(1 + 3 = 4\)
**Final Answer:**
\(\boxed{4}\)
[10]:
llm.shutdown()
Supporting New Reasoning Model Schemas#
For future reasoning models, you can implement the reasoning parser as a subclass of BaseReasoningFormatDetector in python/sglang/srt/reasoning_parser.py and specify the reasoning parser for the new reasoning model schema accordingly.
class DeepSeekR1Detector(BaseReasoningFormatDetector):
"""
Detector for DeepSeek-R1 model.
Assumes reasoning format:
(<think>)*(.*)</think>
Returns all the text before the </think> tag as `reasoning_text`
and the rest of the text as `normal_text`.
Args:
stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
If True, streams reasoning content as it arrives.
"""
def __init__(self, stream_reasoning: bool = False):
# DeepSeek-R1 is assumed to be reasoning until `</think>` token
super().__init__("<think>", "</think>", True, stream_reasoning=stream_reasoning)
# https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
class ReasoningParser:
"""
Parser that handles both streaming and non-streaming scenarios for extracting
reasoning content from model outputs.
Args:
model_type (str): Type of model to parse reasoning from
stream_reasoning (bool): If False, accumulates reasoning content until complete.
If True, streams reasoning content as it arrives.
"""
DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
"deepseek-r1": DeepSeekR1Detector
}
def __init__(self, model_type: str = None, stream_reasoning: bool = True):
if not model_type:
raise ValueError("Model type must be specified")
detector_class = self.DetectorMap.get(model_type.lower())
if not detector_class:
raise ValueError(f"Unsupported model type: {model_type}")
self.detector = detector_class(stream_reasoning=stream_reasoning)
def parse_non_stream(self, full_text: str) -> StreamingParseResult:
"""Non-streaming call: one-time parsing"""
ret = self.detector.detect_and_parse(full_text)
return ret.reasoning_text, ret.normal_text
def parse_stream_chunk(self, chunk_text: str) -> StreamingParseResult:
"""Streaming call: incremental parsing"""
ret = self.detector.parse_streaming_increment(chunk_text)
return ret.reasoning_text, ret.normal_text
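For example, to support a hypothetical model that wraps its reasoning in <reasoning> ... </reasoning> tags, you could add a detector along the following lines and register it in DetectorMap (a sketch only; the model name and tags are assumptions, not an existing parser):
# Hypothetical sketch: a detector for a model that emits <reasoning> ... </reasoning>.
# "my-reasoning-model" and these tags are illustrative assumptions, not a shipped parser.
class MyReasoningModelDetector(BaseReasoningFormatDetector):
    def __init__(self, stream_reasoning: bool = False):
        # Assume the model is in reasoning mode until the closing tag appears.
        super().__init__(
            "<reasoning>", "</reasoning>", True, stream_reasoning=stream_reasoning
        )

# Register the detector so it can be selected via --reasoning-parser my-reasoning-model.
ReasoningParser.DetectorMap["my-reasoning-model"] = MyReasoningModelDetector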