Qwen3-32B 昇腾 NPU 环境 MindIE + vLLM 双引擎部署验证笔记-编程阁

一、用户背景与核心问题

1. 场景：昇腾Duo卡跑Qwen3-32B大模型，先后尝试MindIE与vLLM两种部署方式。

2. 客户问题：MindIE 部署后，用 curl 调用接口时，返回的中文内容出现 `Ã` 乱码。

3. 关键线索：乱码仅出现在MindIE+训练好的Jinja对话模板场景中。

二、乱码根源

1. 乱码本质：`Ã` 是 UTF-8 编码的中文被错误地按 Latin-1/ISO-8859-1 解析、再二次编码为 UTF-8 的典型结果，属于编码不匹配错误。

二、MindIE 部署方案与验证

1. 进入部署环境

启动并进入 qwen32b 容器

进入 MindIE 服务工作目录

cd /usr/local/Ascend/mindie/latest/mindie-service/

2. 检查系统语言环境

查看系统已安装 / 可用语言环境，确认编码基础配置

# List every locale available on the system.
locale -a
# Show the locale settings in effect for the current shell.
locale

3. 编写环境变量脚本 env.sh

创建并配置环境变量，解决中文乱码与 NPU 运行参数优化

# Create the script with: vim env.sh
# Script contents follow.

# Use a UTF-8 locale (these notes set it to address garbled Chinese output).
export LC_ALL=C.utf8
export LANG=C.utf8

# NPU runtime tuning parameters (values as recorded in these notes).
export NPU_MEMORY_FRACTION=0.97
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT=true
export TASK_QUEUE_ENABLE=2
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=2
export CPU_AFFINITY_CONF=2

4. 修改配置文件 config.json

修改模型配置，指定对话模板路径，修复中文乱码核心配置

# 执行 vim 修改文件命令
vim conf/config.json
# 在 BackendConfig → ModelDeployConfig → ModelConfig 新增
# "chat_template":"/home/jy/qwen32b/chat_template.jinjia",
{ "Version" : "1.0.0", "ServerConfig" : { "ipAddress" : "127.0.0.1", "managementIpAddress" : "127.0.0.2", "port" : 1025, "managementPort" : 1026, "metricsPort" : 1027, "allowAllZeroIpListening" : true, "maxLinkNum" : 1000, "httpsEnabled" : false, "fullTextEnabled" : false, "tlsCaPath" : "security/ca/", "tlsCaFile" : ["ca.pem"], "tlsCert" : "security/certs/server.pem", "tlsPk" : "security/keys/server.key.pem", "tlsPkPwd" : "security/pass/key_pwd.txt", "tlsCrlPath" : "security/certs/", "tlsCrlFiles" : ["server_crl.pem"], "managementTlsCaFile" : ["management_ca.pem"], "managementTlsCert" : "security/certs/management/server.pem", "managementTlsPk" : "security/keys/management/server.key.pem", "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", "managementTlsCrlPath" : "security/management/certs/", "managementTlsCrlFiles" : ["server_crl.pem"], "kmcKsfMaster" : "tools/pmt/master/ksfa", "kmcKsfStandby" : "tools/pmt/standby/ksfb", "inferMode" : "standard", "interCommTLSEnabled" : false, "interCommPort" : 1121, "interCommTlsCaPath" : "security/grpc/ca/", "interCommTlsCaFiles" : ["ca.pem"], "interCommTlsCert" : "security/grpc/certs/server.pem", "interCommPk" : "security/grpc/keys/server.key.pem", "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", "interCommTlsCrlPath" : "security/grpc/certs/", "interCommTlsCrlFiles" : ["server_crl.pem"], "openAiSupport" : "vllm", "tokenTimeout" : 600, "e2eTimeout" : 600, "chat_template":"/home/jy/qwen32b/chat_template.jinjia", "distDPServerEnabled":false }, "BackendConfig" : { "backendName" : "mindieservice_llm_engine", "modelInstanceNumber" : 1, "npuDeviceIds" : [[0,1,2,3,4,5,6,7]], "tokenizerProcessNumber" : 8, "multiNodesInferEnabled" : false, "multiNodesInferPort" : 1120, "interNodeTLSEnabled" : true, "interNodeTlsCaPath" : "security/grpc/ca/", "interNodeTlsCaFiles" : ["ca.pem"], 
"interNodeTlsCert" : "security/grpc/certs/server.pem", "interNodeTlsPk" : "security/grpc/keys/server.key.pem", "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", "interNodeTlsCrlPath" : "security/grpc/certs/", "interNodeTlsCrlFiles" : ["server_crl.pem"], "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", "ModelDeployConfig" : { "maxSeqLen" : 16384, "maxInputTokenLen" : 8192, "truncation" : false, "ModelConfig" : [ { "modelInstanceType" : "Standard", "modelName" : "Qwen2.5-3B-Instruct", "modelWeightPath" : "/data//Qwen2.5-3B-Instruct", "worldSize" : 8, "cpuMemSize" : 5, "npuMemSize" : -1, "backendType" : "atb", "trustRemoteCode" : false, "async_scheduler_wait_time": 120, "kv_trans_timeout": 10, "kv_link_timeout": 1080 } ] }, "ScheduleConfig" : { "templateType" : "Standard", "templateName" : "Standard_LLM", "cacheBlockSize" : 128, "maxPrefillBatchSize" : 50, "maxPrefillTokens" : 8192, "prefillTimeMsPerReq" : 150, "prefillPolicyType" : 0, "decodeTimeMsPerReq" : 50, "decodePolicyType" : 0, "maxBatchSize" : 200, "maxIterTimes" : 8192, "maxPreemptCount" : 0, "supportSelectBatch" : false, "maxQueueDelayMicroseconds" : 5000 } } } { "Version" : "1.0.0", "ServerConfig" : { "ipAddress" : "127.0.0.1", "managementIpAddress" : "127.0.0.2", "port" : 1025, "managementPort" : 1026, "metricsPort" : 1027, "allowAllZeroIpListening" : true, "maxLinkNum" : 1000, "httpsEnabled" : false, "fullTextEnabled" : false, "tlsCaPath" : "security/ca/", "tlsCaFile" : ["ca.pem"], "tlsCert" : "security/certs/server.pem", "tlsPk" : "security/keys/server.key.pem", "tlsPkPwd" : "security/pass/key_pwd.txt", "tlsCrlPath" : "security/certs/", "tlsCrlFiles" : ["server_crl.pem"], "managementTlsCaFile" : ["management_ca.pem"], "managementTlsCert" : "security/certs/management/server.pem", "managementTlsPk" : "security/keys/management/server.key.pem", "managementTlsPkPwd" : "security/pass/management/key_pwd.txt", "managementTlsCrlPath" : 
"security/management/certs/", "managementTlsCrlFiles" : ["server_crl.pem"], "kmcKsfMaster" : "tools/pmt/master/ksfa", "kmcKsfStandby" : "tools/pmt/standby/ksfb", "inferMode" : "standard", "interCommTLSEnabled" : false, "interCommPort" : 1121, "interCommTlsCaPath" : "security/grpc/ca/", "interCommTlsCaFiles" : ["ca.pem"], "interCommTlsCert" : "security/grpc/certs/server.pem", "interCommPk" : "security/grpc/keys/server.key.pem", "interCommPkPwd" : "security/grpc/pass/key_pwd.txt", "interCommTlsCrlPath" : "security/grpc/certs/", "interCommTlsCrlFiles" : ["server_crl.pem"], "openAiSupport" : "vllm", "tokenTimeout" : 600, "e2eTimeout" : 600, "chat_template":"/home/jy/qwen32b/chat_template.jinjia", "distDPServerEnabled":false }, "BackendConfig" : { "backendName" : "mindieservice_llm_engine", "modelInstanceNumber" : 1, "npuDeviceIds" : [[0,1,2,3,4,5,6,7]], "tokenizerProcessNumber" : 8, "multiNodesInferEnabled" : false, "multiNodesInferPort" : 1120, "interNodeTLSEnabled" : true, "interNodeTlsCaPath" : "security/grpc/ca/", "interNodeTlsCaFiles" : ["ca.pem"], "interNodeTlsCert" : "security/grpc/certs/server.pem", "interNodeTlsPk" : "security/grpc/keys/server.key.pem", "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt", "interNodeTlsCrlPath" : "security/grpc/certs/", "interNodeTlsCrlFiles" : ["server_crl.pem"], "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa", "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb", "ModelDeployConfig" : { "maxSeqLen" : 16384, "maxInputTokenLen" : 8192, "truncation" : false, "ModelConfig" : [ { "modelInstanceType" : "Standard", "modelName" : "Qwen2.5-3B-Instruct", "modelWeightPath" : "/data//Qwen2.5-3B-Instruct", "worldSize" : 8, "cpuMemSize" : 5, "npuMemSize" : -1, "backendType" : "atb", "trustRemoteCode" : false, "async_scheduler_wait_time": 120, "kv_trans_timeout": 10, "kv_link_timeout": 1080 } ] }, "ScheduleConfig" : { "templateType" : "Standard", "templateName" : "Standard_LLM", "cacheBlockSize" : 128, 
"maxPrefillBatchSize" : 50, "maxPrefillTokens" : 8192, "prefillTimeMsPerReq" : 150, "prefillPolicyType" : 0, "decodeTimeMsPerReq" : 50, "decodePolicyType" : 0, "maxBatchSize" : 200, "maxIterTimes" : 8192, "maxPreemptCount" : 0, "supportSelectBatch" : false, "maxQueueDelayMicroseconds" : 5000 } } }

在 vim 中输入 :wq 保存退出，然后运行当前目录下 bin 文件夹里的 mindieservice_daemon 后台服务程序

5. 启动 MindIE 服务

保存配置后，启动后台服务程序

./bin/mindieservice_daemon

6. 接口验证

使用 curl 调用接口验证服务可用性

# Send one non-streaming chat-completion request and print the response
# headers (-i) together with the body.
curl -i http://172.27.1.25:11011/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{ "model": "qwen3-32b", "messages": [{"role": "user", "content": "你好，请介绍一下自己"}], "stream": false }'

7. 验证结果

MindIE 服务启动成功，接口调用正常，中文输出无乱码，推理响应正常。

三、vLLM 部署方案与验证

1. 编写启动脚本 vllm.sh

配置昇腾 NPU 优化参数与 vLLM 启动参数

# Ascend NPU tuning environment variables (values as recorded in these notes).
export HCCL_OP_EXPANSION_MODE="AIV"
export TASK_QUEUE_ENABLE=1
export HCCL_BUFFSIZE=512
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export VLLM_USE_V1=1

# Start the vLLM service: serve the model directory /home/jy/qwen32b under the
# name "qwen3-32b" on 0.0.0.0:11011, with devices 0-7 visible and
# tensor-parallel size 8.
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 vllm serve /home/jy/qwen32b \
  --host 0.0.0.0 \
  --port 11011 \
  --served-model-name qwen3-32b \
  --tensor-parallel-size 8 \
  --dtype float16 \
  --max-model-len 8192 \
  --trust-remote-code \
  --max-num-batched-tokens 8192 \
  --gpu-memory-utilization 0.85 \
  --no-enable-prefix-caching \
  --distributed_executor_backend "mp" \
  --enable-chunked-prefill \
  --max-num-seqs 32 \
  --enforce-eager \
  --async-scheduling \
  --additional_config '{"enable_cpu_binding":true}' \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'

2. 执行启动脚本

bash vllm.sh

3. 接口验证

使用 curl 调用接口验证服务可用性

# Send one non-streaming chat-completion request to the served model
# "qwen3-32b" and print the response headers (-i) together with the body.
curl -i http://172.27.1.25:11011/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{ "model": "qwen3-32b", "messages": [{"role": "user", "content": "你好，请介绍一下自己"}], "stream": false }'