nix/hosts/llama-server.nix

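# Standalone llama.cpp inference host: builds llama.cpp with CUDA support,
# downloads a Qwen2.5-Coder GGUF model, and serves it via the llama.cpp
# HTTP server on port 8080.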
{ config, pkgs, ... }:
let
  # Build llama.cpp with CUDA support
  cuda-llama = (pkgs.llama-cpp.override {
    cudaSupport = true;
  }).overrideAttrs (oldAttrs: {
    cmakeFlags = oldAttrs.cmakeFlags ++ [
      # Unified memory lets the GPU fall back to system RAM when VRAM is exhausted
      "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
      # Disable CPU instructions not supported by the host CPU -
      # Intel(R) Core(TM) i5-3570K CPU @ 3.40GHz (no FMA/AVX2/AVX-512)
      "-DLLAMA_FMA=OFF"
      "-DLLAMA_AVX2=OFF"
      "-DLLAMA_AVX512=OFF"
      "-DGGML_FMA=OFF"
      "-DGGML_AVX2=OFF"
      "-DGGML_AVX512=OFF"
    ];
  });
  # Define Model Vars
  modelDir = "/models";
  # 7B
  # modelName = "qwen2.5-coder-7b-q8_0.gguf";
  # modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF/resolve/main/${modelName}?download=true";
  # 3B
  modelName = "qwen2.5-coder-3b-q8_0.gguf";
  modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF/resolve/main/${modelName}?download=true";
  modelPath = "${modelDir}/${modelName}";
in
{
  # Allow Nvidia & CUDA
  nixpkgs.config.allowUnfree = true;
  # Enable Graphics
  hardware.graphics = {
    enable = true;
    enable32Bit = true;
    extraPackages = [ pkgs.cudatoolkit ];
  };
  # Load Nvidia Driver Module
  services.xserver.videoDrivers = [ "nvidia" ];
  # Nvidia Package Configuration
  hardware.nvidia = {
    package = config.boot.kernelPackages.nvidiaPackages.stable;
    modesetting.enable = true;
    powerManagement.enable = true;
    open = false;
    nvidiaSettings = true;
  };
  # Network Configuration
  networking.networkmanager.enable = true;
  # Download Model
  systemd.services.download-model = {
    description = "Download Model";
    wantedBy = [ "multi-user.target" ];
    before = [ "llama-cpp.service" ];
    path = [ pkgs.curl pkgs.coreutils ];
    serviceConfig = {
      Type = "oneshot";
      RemainAfterExit = true;
      User = "root";
      Group = "root";
    };
    script = ''
      set -euo pipefail
      if [ ! -f "${modelPath}" ]; then
        mkdir -p "${modelDir}"
        # -L follows redirects, -f/--fail exits with an error on HTTP errors,
        # and -C - resumes an interrupted download
        curl -f -L -C - \
          -H "Accept: application/octet-stream" \
          --retry 3 \
          --retry-delay 5 \
          --max-time 1800 \
          "${modelUrl}" \
          -o "${modelPath}.tmp" && \
        mv "${modelPath}.tmp" "${modelPath}"
      fi
    '';
  };
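  # Note: the 3B Q8_0 GGUF is roughly 3 GB (approximate figure, not taken from
  # this config), so --max-time 1800 caps the download at 30 minutes.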
  # Make the llama.cpp API service wait for the model download
  systemd.services.llama-cpp = {
    after = [ "download-model.service" ];
    requires = [ "download-model.service" ];
  };
  # Enable LLama API
  services.llama-cpp = {
    enable = true;
    host = "0.0.0.0";
    package = cuda-llama;
    model = modelPath;
    port = 8080;
    openFirewall = true;
    # 7B
    # extraFlags = [
    #   "-ngl" "99"
    #   "-fa"
    #   "-ub" "512"
    #   "-b" "512"
    #   "-dt" "0.1"
    #   "--ctx-size" "4096"
    #   "--cache-reuse" "256"
    # ];
    # 3B
    extraFlags = [
      "-ngl" "99"             # offload all model layers to the GPU
      "-fa"                   # enable flash attention
      "-ub" "1024"            # physical (micro) batch size
      "-b" "1024"             # logical batch size
      "--ctx-size" "0"        # 0 = use the model's trained context length
      "--cache-reuse" "256"   # min chunk size for prompt-cache reuse
    ];
};
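  # Example only (not part of this config): once llama-cpp.service is up,
  # the server answers on port 8080, e.g.
  #   curl http://<host>:8080/health
  #   curl http://<host>:8080/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"messages":[{"role":"user","content":"Hello"}]}'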
  # System Packages
  environment.systemPackages = with pkgs; [
    htop
    nvtop
    tmux
    vim
    wget
  ];
}