Windows 環境下 llama.cpp 編譯 + Qwen 模型本地部署全指南詳情 - AI lyshark 博客

在大模型落地場景中，本地輕量化部署因低延遲、高隱私性、無需依賴雲端算力等優勢，成為開發者與 AI 愛好者的熱門需求。本文聚焦 Windows 10/11（64 位）環境，詳細拆解 llama.cpp 工具的編譯流程（支持 CPU/GPU 雙模式，GPU 加速需依賴 NVIDIA CUDA），並指導如何通過 modelscope 下載 GGUF 格式的 Qwen-7B-Chat 模型，最終實現模型本地啓動與 API 服務搭建。

1.打開管理員權限的 PowerShell/CMD，執行以下命令克隆代碼：

git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
mkdir build
cd build

2.基礎編譯（僅 CPU 支持）或者選用GPU 加速編譯（已安裝 CUDA Toolkit）

如果只使用CPU則執行如下配置

cmake .. -G "Visual Studio 18 2026" -A x64 -DLLAMA_CURL=OFF
cmake --build . --config Release

如果已安裝 CUDA Toolkit，添加 -DGGML_CUDA=ON 開啓 GPU 支持（舊參數 -DLLAMA_CUDA=ON 已被棄用）

cmake .. -G "Visual Studio 18 2026" -A x64 -DGGML_CUDA=ON -DLLAMA_CURL=OFF
cmake --build . --config Release

3、下載 GGUF 格式的 Qwen 模型（以 7B 為例）

https://www.modelscope.cn/models

pip install modelscope
modelscope download --model Xorbits/Qwen-7B-Chat-GGUF

下載後的保存位置為 \modelscope\hub\models\Xorbits

4、運行模型啓動 API 服務（支持 HTTP 調用）

# 命令行啓動
chcp 65001
llama-cli.exe -m qwen.gguf -i -c 4096

# CPU 版
llama-server.exe -m qwen.gguf --host 127.0.0.1 --port 11433 -c 4096

# GPU 加速版
llama-server.exe -m qwen-7b-chat.Q4_0.gguf -c 4096 --n-gpu-layers -1

5、服務啓動後默認監聽 http://localhost:8080，可通過 curl 測試調用效果。（若啓動時像上面的 CPU 版命令那樣指定了 --port 11433，請將下文示例中的端口改為 11433。）

curl http://localhost:8080/completion -H "Content-Type: application/json" -d '{
  "prompt": "你好，介紹一下通義千問",
  "temperature": 0.7,
  "max_tokens": 512
}'

6、工具測試，通過代碼調用大模型測試效果。

基礎非流式調用（completion 端點）

import requests
import json

# Completion endpoint of the local llama-server instance.
url = "http://localhost:8080/completion"
headers = {"Content-Type": "application/json"}
data = {
    "model": "qwen.gguf",
    "prompt": "你好，請用100字介紹一下通義千問",
    "temperature": 0.7,  # sampling randomness (lower = more conservative)
    "max_tokens": 512,  # maximum number of tokens to generate
    # No "ctx_size" field: the context window is chosen when the server is
    # started (the -c 4096 flag), not per request.
    "stop": ["<|im_end|>"]  # stop sequence matching Qwen's chat format
}

try:
    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
    response.raise_for_status()
    result = response.json()
    content = result["content"]
except (requests.exceptions.RequestException, ValueError, KeyError) as e:
    # Transport/HTTP failure, a body that is not JSON, or JSON without "content".
    print(f"調用失敗：{e}")
else:
    # Only announce a result once the reply text is actually available.
    print("生成結果：")
    print(content)

多輪對話示例（基於 chat/completions）

import requests
import json

# Chat endpoint of the local server and the headers sent with every request.
headers = {"Content-Type": "application/json"}
url = "http://localhost:8080/chat/completions"

# Running transcript of the conversation, shared with chat_with_model.
chat_history = list()

def chat_with_model(prompt):
    """Send one user turn to the chat endpoint and return the assistant's reply.

    The request carries the module-level ``chat_history`` plus the new user
    message. The shared history is extended only after a reply has been
    parsed, so a failed request does not leave an unanswered user message
    behind to be resent on the next call.

    Args:
        prompt: Text of the user's message.

    Returns:
        The assistant's reply text, or a string starting with "調用失敗：" that
        describes the error.
    """
    user_message = {"role": "user", "content": prompt}
    data = {
        "model": "qwen.gguf",
        "messages": chat_history + [user_message],
        "temperature": 0.7,
        "max_tokens": 512
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()
        result = response.json()
        answer = result["choices"][0]["message"]["content"]
    except (requests.exceptions.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Transport/HTTP errors, a non-JSON body, or an unexpected payload shape.
        return f"調用失敗：{e}"

    # Commit both halves of the turn together so the history always
    # alternates user / assistant.
    chat_history.append(user_message)
    chat_history.append({"role": "assistant", "content": answer})
    return answer

# Multi-turn chat demo: keep prompting until the user types the exit word.
print("開始多輪對話（輸入'退出'結束）：")
for user_input in iter(lambda: input("你："), "退出"):
    print(f"助手：{chat_with_model(user_input)}\n")

帶有對話記憶功能測試

import requests
import json
import re

# Address of the local service (port 11433, OpenAI-compatible path).
url = "http://localhost:11433/chat/completions"
headers = {"Content-Type": "application/json"}

# Conversation history, seeded with a system prompt telling the model to
# rely on earlier turns.
chat_history = [
    dict(
        role="system",
        content="你是一個有幫助的助手，必須記住之前的對話內容，基於上下文回答用户問題。",
    )
]

def clean_pad_content(content):
    """Remove ``[PAD<digits>]`` tokens from model output and trim whitespace.

    Args:
        content: Raw text returned by the model.

    Returns:
        The text with every ``[PAD<digits>]`` occurrence deleted and leading
        and trailing whitespace stripped.
    """
    pad_token = re.compile(r"\[PAD\d+\]")
    without_pad = pad_token.sub("", content)
    return without_pad.strip()

def chat_with_model(prompt):
    """Send one user turn with the full history and return the cleaned reply.

    Context is carried by posting the module-level ``chat_history`` plus the
    new user message. The shared history is updated only after a well-formed
    reply has been received; on any failure it is left unchanged, so a failed
    turn is not resent as a dangling user message on the next call.

    Args:
        prompt: Text of the user's message.

    Returns:
        The assistant's reply with PAD tokens removed, or a human-readable
        error string (connection failure, timeout, malformed payload, or
        another exception).
    """
    user_message = {"role": "user", "content": prompt}
    data = {
        "model": "qwen.gguf",  # model name sent to the service
        "messages": chat_history + [user_message],  # full history plus the new turn
        "temperature": 0.7,
        "max_tokens": 512,
        "stream": False,  # ask for one non-streamed JSON response
        "stop": ["[PAD"]  # stop generating as soon as a PAD token starts
    }

    response = None  # kept so the generic handler can show the raw body, if any
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()  # surface HTTP errors such as 404 / 500

        result = response.json()
        # Debug aid: show the start of the raw payload.
        print(f"調試：模型原始返回 = {json.dumps(result, ensure_ascii=False)[:500]}")

        # OpenAI-compatible shape: the text lives at choices[0].message.content.
        if "choices" not in result or len(result["choices"]) == 0:
            return f"返回格式異常：缺少 choices 字段，原始返回：{json.dumps(result, ensure_ascii=False)[:300]}"
        choice = result["choices"][0]
        if "message" not in choice or "content" not in choice["message"]:
            return f"返回格式異常：缺少 message/content 字段，原始返回：{json.dumps(result, ensure_ascii=False)[:300]}"

        answer = clean_pad_content(choice["message"]["content"])
    except requests.exceptions.ConnectionError:
        return "連接失敗：請檢查本地服務是否在 11433 端口運行"
    except requests.exceptions.Timeout:
        return "請求超時：模型響應過慢"
    except Exception as e:
        raw_text = response.text[:300] if response is not None else '無'
        return f"調用失敗：{str(e)}，原始返回：{raw_text}"

    # Commit the user message and the reply together so the next request
    # carries a complete, alternating transcript.
    chat_history.append(user_message)
    chat_history.append({"role": "assistant", "content": answer})
    return answer

# Multi-turn chat test (focused on context memory).
print("開始多輪對話（輸入'退出'結束）：")
print("提示：先發送 '我的名字是李四'，再發送 '我叫什麼名字' 測試記憶功能\n")
while True:
    user_input = input("你：")
    trimmed = user_input.strip()
    if trimmed == "退出":
        break
    if trimmed:
        # The untrimmed text is what gets sent to the model.
        print(f"助手：{chat_with_model(user_input)}\n")
    else:
        print("助手：請輸入有效內容！\n")

函數工具調用測試

import requests
import json
import re
from datetime import datetime

# ====================== 1. 定義可用工具集 ======================
# Tool 1: report the current time
def get_current_time():
    """Return the current local time as a labelled display string.

    Returns:
        The label "當前時間為：" followed by the current ``datetime.now()``
        value rendered as ``YYYY-MM-DD HH:MM:SS``.
    """
    return f"當前時間為：{datetime.now():%Y-%m-%d %H:%M:%S}"


# Tool 2: addition
def calculate_add(a: float, b: float):
    """Add two numbers and describe the sum as ``"a + b = result"``.

    Args:
        a: First addend.
        b: Second addend.

    Returns:
        A string showing both operands and their sum.
    """
    total = a + b
    return f"{a} + {b} = {total}"


# Tool registry: maps each tool name to its callable, a description, and a
# per-parameter spec (type / required / description) read by execute_tool.
tool_registry = dict(
    get_current_time=dict(
        function=get_current_time,
        description="獲取當前的本地時間，無需參數",
        parameters={},  # takes no arguments
    ),
    calculate_add=dict(
        function=calculate_add,
        description="計算兩個數字的加法，需要兩個參數：a（數字）、b（數字）",
        parameters=dict(
            a=dict(type="float", required=True, description="加數1"),
            b=dict(type="float", required=True, description="加數2"),
        ),
    ),
)

# ====================== 2. Conversation history and basic configuration ======================
# Address of the local LLM service
url = "http://localhost:11433/chat/completions"
headers = {"Content-Type": "application/json"}

# The system prompt lists the available tools and the JSON shape the model
# must use to request one; parse_tool_call looks for that shape in replies.
chat_history = [
    dict(
        role="system",
        content="""你是一個有幫助的助手，必須記住之前的對話內容，基於上下文回答用户問題。
你可以調用以下工具來輔助回答：
1. get_current_time：獲取當前的本地時間，無需參數
2. calculate_add：計算兩個數字的加法，需要參數a和b（均為數字）

如果需要調用工具，請嚴格按照以下JSON格式返回（僅返回JSON，不要加其他內容）：
{"name": "工具名", "parameters": {"參數名": 參數值}}

如果不需要調用工具，直接回答用户問題即可，不要返回JSON格式。""",
    )
]


# ====================== 3. Helper functions for tool calls ======================
def clean_pad_content(content):
    """Drop ``[PAD<digits>]`` tokens from model output and trim whitespace.

    Args:
        content: Raw text returned by the model.

    Returns:
        The text with every ``[PAD<digits>]`` occurrence removed and leading
        and trailing whitespace stripped.
    """
    # Splitting on the token and re-joining removes every occurrence.
    pieces = re.split(r"\[PAD\d+\]", content)
    return "".join(pieces).strip()


def parse_tool_call(content):
    """Extract a tool-call instruction (a JSON object) from a model reply.

    The reply is scanned for the first position where a complete JSON object
    can be decoded and that object has both a "name" and a "parameters" key.
    Decoding stops at the end of that object, so surrounding prose, even
    prose containing further braces, does not break the parse.

    Args:
        content: Text returned by the model.

    Returns:
        The decoded tool-call dict, or None when ``content`` is not a string
        or contains no such object.
    """
    if not isinstance(content, str):
        return None

    decoder = json.JSONDecoder()
    start = content.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value starting at `start`
            # and ignores whatever text follows it.
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "name" in candidate and "parameters" in candidate:
            return candidate
        start = content.find("{", start + 1)
    return None


def execute_tool(tool_call, registry=None):
    """Run the tool named in ``tool_call`` and describe the outcome as text.

    Args:
        tool_call: Dict with a "name" key and an optional "parameters" dict,
            as produced by parse_tool_call. It is not modified.
        registry: Mapping of tool name to
            ``{"function": ..., "parameters": {...}}``. Defaults to the
            module-level ``tool_registry``.

    Returns:
        A success string containing the tool's result, or a string starting
        with "錯誤：" describing why the call could not be completed
        (unknown tool, bad or missing parameters, conversion failure, or an
        exception raised by the tool).
    """
    registry = tool_registry if registry is None else registry
    tool_name = tool_call["name"]

    # Reject tools that are not registered.
    if tool_name not in registry:
        return f"錯誤：不存在名為 {tool_name} 的工具，可用工具：{list(registry.keys())}"

    tool_info = registry[tool_name]
    tool_func = tool_info["function"]
    tool_params = tool_info["parameters"]

    # A JSON null is treated like "no parameters"; anything else must be an object.
    raw_parameters = tool_call.get("parameters")
    if raw_parameters is None:
        raw_parameters = {}
    if not isinstance(raw_parameters, dict):
        return f"錯誤：調用 {tool_name} 的參數必須是 JSON 對象"
    # Work on a copy so type conversion never rewrites the caller's dict.
    parameters = dict(raw_parameters)

    # Check required parameters.
    missing_params = [
        param_name
        for param_name, param_info in tool_params.items()
        if param_info.get("required") and param_name not in parameters
    ]
    if missing_params:
        return f"錯誤：調用 {tool_name} 缺少必填參數：{', '.join(missing_params)}"

    # Coerce values to the declared type (e.g. a numeric string to a number).
    converters = {"float": float, "int": int}
    try:
        for param_name, param_info in tool_params.items():
            convert = converters.get(param_info.get("type", "str"))
            if convert is not None and param_name in parameters:
                parameters[param_name] = convert(parameters[param_name])
    except (TypeError, ValueError) as e:
        # TypeError covers values such as None or a list; ValueError covers
        # strings that are not numbers.
        return f"錯誤：參數類型轉換失敗 - {str(e)}"

    # Run the tool; any failure inside it is reported rather than raised.
    try:
        result = tool_func(**parameters)
    except Exception as e:
        return f"錯誤：執行 {tool_name} 失敗 - {str(e)}"
    return f"工具調用成功（{tool_name}）：{result}"


# ====================== 4. Core chat function (with tool-call support) ======================
def _extract_clean_answer(result):
    """Return the PAD-stripped text of choices[0].message.content, or None if the payload is malformed."""
    if not isinstance(result, dict):
        return None
    choices = result.get("choices")
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return None
    message = choices[0].get("message")
    if not isinstance(message, dict) or not isinstance(message.get("content"), str):
        return None
    return clean_pad_content(message["content"])


def chat_with_model(prompt):
    """Run one conversation turn, executing a tool when the model asks for one.

    The model is first called with the shared ``chat_history`` plus the new
    user message. If its reply parses as a tool call, the tool is executed,
    its result is added to the messages, and the model is called a second
    time to produce the final answer.

    The messages of the turn are collected locally and appended to
    ``chat_history`` only when the whole turn succeeds, so a failed request
    never leaves a partial turn in the shared history.

    Args:
        prompt: Text of the user's message.

    Returns:
        The final assistant reply, or a human-readable error string.
    """
    turn = [{"role": "user", "content": prompt}]
    data = {
        "model": "qwen.gguf",
        "messages": chat_history + turn,
        "temperature": 0.7,
        "max_tokens": 512,
        "stream": False,
        "stop": ["[PAD"]
    }

    response = None  # most recent HTTP response, shown by the generic handler
    try:
        # First call: does the model answer directly or request a tool?
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()
        result = response.json()

        clean_answer = _extract_clean_answer(result)
        if clean_answer is None:
            return f"返回格式異常：{json.dumps(result, ensure_ascii=False)[:300]}"

        tool_call = parse_tool_call(clean_answer)
        if not tool_call:
            # No tool needed: the first reply is the answer.
            turn.append({"role": "assistant", "content": clean_answer})
            chat_history.extend(turn)
            return clean_answer

        print(f"📢 檢測到工具調用：{json.dumps(tool_call, ensure_ascii=False)}")
        tool_result = execute_tool(tool_call)
        print(f"🔧 工具執行結果：{tool_result}")

        # Let the model see the tool output.
        # NOTE(review): the result is recorded under the "assistant" role and
        # the tool-call JSON itself is not stored, as in the original flow;
        # confirm this message shape suits the chat template in use.
        turn.append({
            "role": "assistant",
            "content": f"工具調用結果：{tool_result}"
        })

        # Second call: produce the final answer from the tool result.
        data["messages"] = chat_history + turn
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=60)
        response.raise_for_status()
        second_result = response.json()

        final_answer = _extract_clean_answer(second_result)
        if final_answer is None:
            return f"工具調用後二次請求異常：{json.dumps(second_result, ensure_ascii=False)[:300]}"

        turn.append({"role": "assistant", "content": final_answer})
        chat_history.extend(turn)
        return final_answer

    except requests.exceptions.ConnectionError:
        return "連接失敗：請檢查本地服務是否在 11433 端口運行"
    except requests.exceptions.Timeout:
        return "請求超時：模型響應過慢"
    except Exception as e:
        raw_text = response.text[:300] if response is not None else '無'
        return f"調用失敗：{str(e)}，原始返回：{raw_text}"


# ====================== 5. Multi-turn chat test (with tool calls) ======================
if __name__ == "__main__":
    # Banner: how to quit, plus three sample prompts.
    for banner_line in (
        "開始多輪對話（輸入'退出'結束）：",
        "📌 測試工具調用示例：",
        "   1. 現在幾點了？（調用獲取時間工具）",
        "   2. 計算123+456等於多少？（調用加法工具）",
        "   3. 我的名字是李四，我叫什麼？（測試上下文記憶）\n",
    ):
        print(banner_line)

    while True:
        user_input = input("你：")
        trimmed = user_input.strip()
        if trimmed == "退出":
            break
        if trimmed:
            # The untrimmed text is what gets sent to the model.
            print(f"助手：{chat_with_model(user_input)}\n")
        else:
            print("助手：請輸入有效內容！\n")

lyshark 博客

lyshark 博客

博客 / 詳情

Windows 環境下 llama.cpp 編譯 + Qwen 模型本地部署全指南

發佈評論

Product

Company

Support

Company

博客 / 詳情

Windows 環境下 llama.cpp 編譯 + Qwen 模型本地部署全指南

發佈 評論

發佈評論