This commit is contained in:
Evan Reichard 2025-01-27 20:21:43 -05:00
parent dbb1183018
commit 66a47b1338
9 changed files with 654 additions and 47 deletions

View File

@ -13,6 +13,7 @@ scp -r * root@10.10.10.10:/etc/nixos
ls -l /dev/disk/by-id
# Partition Disk
# WARNING: This will destroy all data on the disk(s)
sudo nix \
--experimental-features "nix-command flakes" \
run github:nix-community/disko -- \
@ -33,7 +34,7 @@ sudo reboot
## Copy Config Back to Host
```bash
scp -r * nixos@10.10.10.10:/etc/nixos
scp -r * nixos@10.0.20.201:/etc/nixos
```
## Rebuild NixOS
@ -52,10 +53,14 @@ sudo nixos-install --flake /etc/nixos#lin-va-rke1
cat /var/lib/rancher/rke2/server/node-token
# Deploy Following Nodes
echo "<TOKEN>" > ./k8s/rke2-token
echo "<TOKEN>" > rke2-token
sudo nixos-install --flake /etc/nixos#lin-va-rke2
```
## TODO
## Notes
The OpenEBS DiskPool configuration is not being applied. We likely need to consolidate the RKE2 config, generate the DiskPool config in full, and then apply it.
## Kasten Port Forward
```bash
kubectl port-forward -n kasten svc/gateway 8000:80
```

View File

@ -26,7 +26,7 @@
systemConfig = ./hosts/llama-server.nix;
moduleConfig = {
hostName = "lin-va-llama1";
mainDiskID = "/dev/sda";
mainDiskID = "/dev/disk/by-id/ata-MTFDDAK512MBF-1AN1ZABHA_161212233628";
};
};

View File

@ -5,7 +5,7 @@ let
cudaSupport = true;
}).overrideAttrs (oldAttrs: {
cmakeFlags = oldAttrs.cmakeFlags ++ [
"-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=ON"
"-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
# Disable CPU Instructions - Intel(R) Core(TM) i5-3570K CPU @ 3.40GHz
"-DLLAMA_FMA=OFF"
@ -19,9 +19,16 @@ let
# Define Model Vars
modelDir = "/models";
modelName = "qwen2.5-coder-7b-q8_0.gguf";
# 7B
# modelName = "qwen2.5-coder-7b-q8_0.gguf";
# modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF/resolve/main/${modelName}?download=true";
# 3B
modelName = "qwen2.5-coder-3b-q8_0.gguf";
modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF/resolve/main/${modelName}?download=true";
modelPath = "${modelDir}/${modelName}";
modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF/resolve/main/${modelName}?download=true";
in
{
@ -97,18 +104,35 @@ in
model = modelPath;
port = 8080;
openFirewall = true;
# 7B
# extraFlags = [
# "-ngl"
# "99"
# "-fa"
# "-ub"
# "512"
# "-b"
# "512"
# "-dt"
# "0.1"
# "--ctx-size"
# "4096"
# "--cache-reuse"
# "256"
# ];
# 3B
extraFlags = [
"-ngl"
"99"
"-fa"
"-ub"
"512"
"1024"
"-b"
"512"
"-dt"
"0.1"
"1024"
"--ctx-size"
"4096"
"0"
"--cache-reuse"
"256"
];

147
hosts/rke2-ceph.nix Normal file
View File

@ -0,0 +1,147 @@
{ config, pkgs, lib, ... }:
{
# Node Nix Config
options =
  let
    # Local aliases so each declaration below reads without the `lib.` prefix.
    inherit (lib) mkOption types;
  in
  {
    # Device ID of the disk reserved for data (required, no default).
    dataDiskID = mkOption {
      type = types.str;
      description = "The device ID for the data disk";
    };

    # Address of the server to join; the empty default means "do not join".
    serverAddr = mkOption {
      type = types.str;
      description = "The server to join";
      default = "";
    };

    # Static network settings for the node, grouped in one submodule.
    networkConfig = mkOption {
      description = "Network configuration";
      type = types.submodule {
        options = {
          interface = mkOption {
            type = types.str;
            description = "Network interface name";
            example = "enp0s3";
          };

          address = mkOption {
            type = types.str;
            description = "Static IP address";
            example = "10.0.20.200";
          };

          defaultGateway = mkOption {
            type = types.str;
            description = "Default gateway IP";
            example = "10.0.20.254";
          };

          nameservers = mkOption {
            type = types.listOf types.str;
            description = "List of DNS servers";
            example = [ "10.0.20.254" "8.8.8.8" ];
            default = [ "8.8.8.8" "8.8.4.4" ];
          };
        };
      };
    };
  };
config = {
# ----------------------------------------
# ---------- Base Configuration ----------
# ----------------------------------------
# Ceph Requirements
boot.kernelModules = [ "rbd" ];
# Network Configuration
networking = {
  hostName = config.hostName;
  networkmanager.enable = false;

  # Interface Configuration
  # Gateway and DNS servers are taken directly from the node's `networkConfig`.
  inherit (config.networkConfig) defaultGateway nameservers;
  interfaces.${config.networkConfig.interface}.ipv4.addresses = [{
    inherit (config.networkConfig) address;
    # Every node is placed on a /24; the prefix is not configurable per node.
    prefixLength = 24;
  }];

  firewall = {
    enable = true;

    allowedTCPPorts = [
      # RKE2 Ports - https://docs.rke2.io/install/requirements#networking
      6443 # Kubernetes API
      9345 # RKE2 supervisor API
      2379 # etcd Client Port
      2380 # etcd Peer Port
      2381 # etcd Metrics Port
      10250 # kubelet metrics
      9099 # Canal CNI health checks

      # Ceph Ports
      3300 # Ceph MON daemon
      6789 # Ceph MON service
    ];

    # Ceph OSD range - declared as a single range rather than expanding
    # `lib.range 6800 7300` into 501 individually listed ports.
    allowedTCPPortRanges = [
      { from = 6800; to = 7300; }
    ];

    allowedUDPPorts = [
      # RKE2 Ports - https://docs.rke2.io/install/requirements#networking
      8472 # Canal CNI with VXLAN
      # 51820 # Canal CNI with WireGuard IPv4 (if using encryption)
      # 51821 # Canal CNI with WireGuard IPv6 (if using encryption)
    ];
  };
};
# System Packages
# Tools installed system-wide on every node; each entry is referenced
# explicitly from `pkgs` instead of through a `with pkgs;` scope.
environment.systemPackages = [
  pkgs.htop
  pkgs.k9s
  pkgs.kubectl
  pkgs.kubernetes-helm
  pkgs.nfs-utils
  pkgs.tmux
  pkgs.vim
];
# ----------------------------------------
# ---------- RKE2 Configuration ----------
# ----------------------------------------
# RKE2 Join Token
# Installs the join token at /etc/rancher/rke2/node-token, owned by root with mode 0600.
# Only defined when `serverAddr` is non-empty, i.e. when the node joins an existing server.
# NOTE(review): `../rke2-token` is a path literal, so its contents are presumably imported
# into the Nix store before being placed in /etc; the 0600 mode below only covers the
# /etc copy - confirm that this exposure of the token is acceptable.
environment.etc."rancher/rke2/node-token" = lib.mkIf (config.serverAddr != "") {
source = ../rke2-token;
mode = "0600";
user = "root";
group = "root";
};
# Enable RKE2
# RKE2 service definition: the node always runs with the "server" role; the join
# settings at the bottom are merged in only when `serverAddr` is non-empty.
services.rke2 = {
enable = true;
role = "server";
# Bundled RKE2 components that are switched off
disable = [
# Disable - Utilizing Traefik
"rke2-ingress-nginx"
# Disable - Utilizing OpenEBS's Snapshot Controller
# NOTE(review): this profile copies the Ceph and Kasten manifests, not OpenEBS -
# confirm which snapshot controller actually replaces these components here.
"rke2-snapshot-controller"
"rke2-snapshot-controller-crd"
"rke2-snapshot-validation-webhook"
];
} // lib.optionalAttrs (config.serverAddr != "") {
# Join settings: target server plus the token file installed under /etc/rancher/rke2
serverAddr = config.serverAddr;
tokenFile = "/etc/rancher/rke2/node-token";
};
# Bootstrap Kubernetes Manifests
# Activation script that creates the RKE2 server manifests directory and copies the
# Ceph and Kasten manifests from this repository into it.
# NOTE(review): RKE2 presumably auto-applies files placed in this directory - confirm.
system.activationScripts.k8s-manifests = {
# No ordering dependencies on other activation scripts
deps = [ ];
text = ''
mkdir -p /var/lib/rancher/rke2/server/manifests
# Base Configs
cp ${../k8s/ceph.yaml} /var/lib/rancher/rke2/server/manifests/ceph-base.yaml
cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml
'';
};
};
}

162
hosts/rke2-openebs.nix Normal file
View File

@ -0,0 +1,162 @@
{ config, pkgs, lib, ... }:
{
# Node Nix Config
options =
  let
    # Local aliases so each declaration below reads without the `lib.` prefix.
    inherit (lib) mkOption types;
  in
  {
    # Device ID of the disk reserved for data (required, no default).
    dataDiskID = mkOption {
      type = types.str;
      description = "The device ID for the data disk";
    };

    # Address of the server to join; the empty default means "do not join".
    serverAddr = mkOption {
      type = types.str;
      description = "The server to join";
      default = "";
    };

    # Static network settings for the node, grouped in one submodule.
    networkConfig = mkOption {
      description = "Network configuration";
      type = types.submodule {
        options = {
          interface = mkOption {
            type = types.str;
            description = "Network interface name";
            example = "enp0s3";
          };

          address = mkOption {
            type = types.str;
            description = "Static IP address";
            example = "10.0.20.200";
          };

          defaultGateway = mkOption {
            type = types.str;
            description = "Default gateway IP";
            example = "10.0.20.254";
          };

          nameservers = mkOption {
            type = types.listOf types.str;
            description = "List of DNS servers";
            example = [ "10.0.20.254" "8.8.8.8" ];
            default = [ "8.8.8.8" "8.8.4.4" ];
          };
        };
      };
    };
  };
config = {
# ----------------------------------------
# ---------- Base Configuration ----------
# ----------------------------------------
# OpenEBS Mayastor Requirements
boot.kernelModules = [ "nvme_tcp" ];
boot.kernel.sysctl = {
"vm.nr_hugepages" = 1024;
};
# Network Configuration
networking = {
  # Identity
  hostName = config.hostName;
  networkmanager.enable = false;

  # Interface Configuration - values come from the node's `networkConfig`
  defaultGateway = config.networkConfig.defaultGateway;
  nameservers = config.networkConfig.nameservers;

  interfaces.${config.networkConfig.interface} = {
    ipv4.addresses = [
      {
        address = config.networkConfig.address;
        prefixLength = 24;
      }
    ];
  };

  firewall = {
    enable = true;

    allowedTCPPorts = [
      # RKE2 Ports - https://docs.rke2.io/install/requirements#networking
      6443 # Kubernetes API
      9345 # RKE2 supervisor API
      2379 # etcd Client Port
      2380 # etcd Peer Port
      2381 # etcd Metrics Port
      10250 # kubelet metrics
      9099 # Canal CNI health checks

      # OpenEBS Mayastor - https://openebs.io/docs/user-guides/replicated-storage-user-guide/replicated-pv-mayastor/rs-installation#network-requirements
      10124 # REST API
      8420 # NVMf
      4421 # NVMf
    ];

    allowedUDPPorts = [
      # RKE2 Ports - https://docs.rke2.io/install/requirements#networking
      8472 # Canal CNI with VXLAN
      # 51820 # Canal CNI with WireGuard IPv4 (if using encryption)
      # 51821 # Canal CNI with WireGuard IPv6 (if using encryption)
    ];
  };
};
# System Packages
# Tools installed system-wide on every node; each entry is referenced
# explicitly from `pkgs` instead of through a `with pkgs;` scope.
environment.systemPackages = [
  pkgs.htop
  pkgs.k9s
  pkgs.kubectl
  pkgs.kubernetes-helm
  pkgs.nfs-utils
  pkgs.vim
];
# ----------------------------------------
# ---------- RKE2 Configuration ----------
# ----------------------------------------
# RKE2 Join Token
# Installs the join token at /etc/rancher/rke2/node-token, owned by root with mode 0600.
# Only defined when `serverAddr` is non-empty, i.e. when the node joins an existing server.
# NOTE(review): `../rke2-token` is a path literal, so its contents are presumably imported
# into the Nix store before being placed in /etc; the 0600 mode below only covers the
# /etc copy - confirm that this exposure of the token is acceptable.
environment.etc."rancher/rke2/node-token" = lib.mkIf (config.serverAddr != "") {
source = ../rke2-token;
mode = "0600";
user = "root";
group = "root";
};
# Enable RKE2
# RKE2 service definition: the node always runs with the "server" role; the join
# settings at the bottom are merged in only when `serverAddr` is non-empty.
services.rke2 = {
enable = true;
role = "server";
# Bundled RKE2 components that are switched off
disable = [
# Disable - Utilizing Traefik
"rke2-ingress-nginx"
# Disable - Utilizing OpenEBS's Snapshot Controller
"rke2-snapshot-controller"
"rke2-snapshot-controller-crd"
"rke2-snapshot-validation-webhook"
];
# OpenEBS Schedulable - label the node for the Mayastor engine
nodeLabel = [
"openebs.io/engine=mayastor"
];
} // lib.optionalAttrs (config.serverAddr != "") {
# Join settings: target server plus the token file installed under /etc/rancher/rke2
serverAddr = config.serverAddr;
tokenFile = "/etc/rancher/rke2/node-token";
};
# Bootstrap Kubernetes Manifests
# Activation script that creates the RKE2 server manifests directory, copies the OpenEBS
# and Kasten manifests into it, and writes a per-host disk pool manifest named after
# `config.hostName`.
# NOTE(review): `pkgs.substituteAll` presumably fills `hostName` / `dataDiskID` placeholders
# in openebs-disk-pool.yaml (template not visible here); newer nixpkgs releases appear to
# deprecate it in favour of `replaceVars` - confirm against the pinned nixpkgs.
system.activationScripts.k8s-manifests = {
# No ordering dependencies on other activation scripts
deps = [ ];
text = ''
mkdir -p /var/lib/rancher/rke2/server/manifests
# Base Configs
cp ${../k8s/openebs.yaml} /var/lib/rancher/rke2/server/manifests/openebs-base.yaml
cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml
# OpenEBS Disk Pool
cp ${pkgs.substituteAll {
src = ../k8s/openebs-disk-pool.yaml;
hostName = config.hostName;
dataDiskID = config.dataDiskID;
}} /var/lib/rancher/rke2/server/manifests/openebs-disk-pool-${config.hostName}.yaml
'';
};
};
}

View File

@ -47,10 +47,33 @@
# ---------- Base Configuration ----------
# ----------------------------------------
# OpenEBS Mayastor Requirements
boot.kernelModules = [ "nvme_tcp" ];
boot.kernel.sysctl = {
"vm.nr_hugepages" = 1024;
# Longhorn Requirements
boot.kernelModules = [
"iscsi_tcp"
"dm_crypt"
];
# Longhorn Data Disk
disko.devices = {
disk.longhorn = {
type = "disk";
device = config.dataDiskID;
content = {
type = "gpt";
partitions = {
longhorn = {
size = "100%";
content = {
type = "filesystem";
format = "xfs";
mountpoint = "/storage/longhorn";
mountOptions = [ "defaults" "nofail" ];
extraArgs = [ "-d" "su=128k,sw=8" ];
};
};
};
};
};
};
# Network Configuration
@ -78,10 +101,8 @@
10250 # kubelet metrics
9099 # Canal CNI health checks
# OpenEBS Mayastor - https://openebs.io/docs/user-guides/replicated-storage-user-guide/replicated-pv-mayastor/rs-installation#network-requirements
10124 # REST API
8420 # NVMf
4421 # NVMf
# iSCSI Port
3260
];
allowedUDPPorts = [
@ -100,6 +121,8 @@
kubectl
kubernetes-helm
nfs-utils
openiscsi
tmux
vim
];
@ -124,39 +147,39 @@
# Disable - Utilizing Traefik
"rke2-ingress-nginx"
# Distable - Utilizing OpenEBS's Snapshot Controller
# Disable - Utilizing Longhorn's Snapshot Controller
"rke2-snapshot-controller"
"rke2-snapshot-controller-crd"
"rke2-snapshot-validation-webhook"
];
# OpenEBS Scheduleable
nodeLabel = [
"openebs.io/engine=mayastor"
];
} // lib.optionalAttrs (config.serverAddr != "") {
serverAddr = config.serverAddr;
tokenFile = "/etc/rancher/rke2/node-token";
};
# Enable OpeniSCSI
services.openiscsi = {
enable = true;
name = "iqn.2025-01.${config.hostName}:initiator";
};
# Bootstrap Kubernetes Manifests
# system.activationScripts.k8s-manifests = {
# deps = [ ];
# text = ''
# mkdir -p /var/lib/rancher/rke2/server/manifests
system.activationScripts.k8s-manifests = {
deps = [ ];
text = ''
mkdir -p /var/lib/rancher/rke2/server/manifests
# # Base Configs
# cp ${../k8s/openebs.yaml} /var/lib/rancher/rke2/server/manifests/openebs-base.yaml
# cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml
# Base Configs
cp ${../k8s/longhorn.yaml} /var/lib/rancher/rke2/server/manifests/longhorn-base.yaml
# cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml
'';
};
# # OpenEBS Disk Pool
# cp ${pkgs.substituteAll {
# src = ../k8s/openebs-disk-pool.yaml;
# hostName = config.hostName;
# dataDiskID = config.dataDiskID;
# }} /var/lib/rancher/rke2/server/manifests/openebs-disk-pool-${config.hostName}.yaml
# '';
# };
# Add Symlinks Expected by Longhorn
system.activationScripts.add-symlinks = ''
mkdir -p /usr/bin
ln -sf ${pkgs.openiscsi}/bin/iscsiadm /usr/bin/iscsiadm
ln -sf ${pkgs.openiscsi}/bin/iscsid /usr/bin/iscsid
'';
};
}

164
k8s/ceph.yaml Normal file
View File

@ -0,0 +1,164 @@
---
# Namespace
apiVersion: v1
kind: Namespace
metadata:
labels:
name: rook-ceph
name: rook-ceph
---
# HelmChart
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
name: ceph
namespace: kube-system
spec:
repo: https://charts.rook.io/release
chart: rook-ceph
targetNamespace: rook-ceph
valuesContent: |-
enableDiscoveryDaemon: true
---
# CephCluster
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: rook-ceph
namespace: rook-ceph
spec:
dataDirHostPath: /var/lib/rook
cephVersion:
image: quay.io/ceph/ceph:v19.2
allowUnsupported: false
# HA - One monitor per node
mon:
count: 3
allowMultiplePerNode: false
# Ceph Dashboard
dashboard:
enabled: true
ssl: true
# Network Configuration
network:
provider: host
# Storage Configuration
storage:
useAllNodes: true
useAllDevices: true
config:
osdsPerDevice: "1"
replicatedSize: "3"
# Disruption Management
disruptionManagement:
managePodBudgets: true
osdMaintenanceTimeout: 30
# Resource Management
# resources:
# mgr:
# limits:
# cpu: "1000m"
# memory: "1Gi"
# requests:
# cpu: "500m"
# memory: "512Mi"
# mon:
# limits:
# cpu: "1000m"
# memory: "1Gi"
# requests:
# cpu: "500m"
# memory: "512Mi"
# osd:
# limits:
# cpu: "2000m"
# memory: "4Gi"
# requests:
# cpu: "1000m"
# memory: "2Gi"
---
# BlockPool - Single Replica
# Data in this pool has no redundancy: losing the one OSD holding a
# placement group loses that data.
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
  name: ceph-block-pool-single
  namespace: rook-ceph
spec:
  failureDomain: host
  replicated:
    size: 1
    # Rook refuses a replica size of 1 unless the safety check is explicitly
    # turned off for the pool.
    requireSafeReplicaSize: false
---
# BlockPool - Three Replica
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
name: ceph-block-pool-triple
namespace: rook-ceph
spec:
failureDomain: host
replicated:
size: 3
---
# StorageClass - Three Replica
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ceph-block-triple
annotations:
storageclass.kubernetes.io/is-default-class: "true"
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
pool: ceph-block-pool-triple
clusterID: rook-ceph
imageFormat: "2"
imageFeatures: layering
# Ceph CSI driver
csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
csi.storage.k8s.io/fstype: ext4
allowVolumeExpansion: true
volumeBindingMode: Immediate
reclaimPolicy: Delete
---
# StorageClass - Single Replica
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ceph-block-single
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
pool: ceph-block-pool-single
clusterID: rook-ceph
imageFormat: "2"
imageFeatures: layering
# Ceph CSI driver
csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
csi.storage.k8s.io/fstype: ext4
allowVolumeExpansion: true
volumeBindingMode: Immediate
reclaimPolicy: Delete

View File

@ -45,7 +45,39 @@ spec:
repo: https://charts.kasten.io/
chart: k10
targetNamespace: kasten
valuesContent: |-
global:
persistence:
storageClass: mayastor-r3
---
kind: Profile
apiVersion: config.kio.kasten.io/v1alpha1
metadata:
name: k10-backup-profile
namespace: kasten
spec:
locationSpec:
type: FileStore
fileStore:
claimName: va-unraid-backup-rw
credential:
secretType: ""
secret:
apiVersion: ""
kind: ""
name: ""
namespace: ""
type: Location
---
apiVersion: config.kio.kasten.io/v1alpha1
kind: TransformSet
metadata:
name: storage-class-rename
namespace: kasten
spec:
comment: Renames cstor-r1 to ceph-block-triple
transforms:
- json:
- op: replace
path: /spec/storageClassName
value: ceph-block-triple
name: StorageClassRename
subject:
name: ""
resource: persistentvolumeclaims

50
k8s/longhorn.yaml Normal file
View File

@ -0,0 +1,50 @@
---
# Namespace
apiVersion: v1
kind: Namespace
metadata:
labels:
name: longhorn
name: longhorn
---
# HelmChart
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
name: longhorn
namespace: kube-system
spec:
repo: https://charts.longhorn.io
chart: longhorn
targetNamespace: longhorn
valuesContent: |-
persistence:
defaultClass: true
defaultClassReplicaCount: 3
reclaimPolicy: Delete
defaultSettings:
defaultDataPath: /storage/longhorn
defaultReplicaCount: 3
nodeDownPodDeletionPolicy: delete-both-statefulset-and-deployment-pod
guaranteedEngineManagerCPU: 0.25
guaranteedReplicaManagerCPU: 0.25
longhornManager:
tolerations:
- key: "node-role.kubernetes.io/control-plane"
operator: "Exists"
effect: "NoSchedule"
---
# StorageClass
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: longhorn-block-triple
provisioner: driver.longhorn.io
allowVolumeExpansion: true
parameters:
numberOfReplicas: "3"
staleReplicaTimeout: "2880"
fsType: "ext4"