From 66a47b1338387fc684e923998063f0b4a45f0236 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Mon, 27 Jan 2025 20:21:43 -0500 Subject: [PATCH] yay --- README.md | 13 +++- flake.nix | 2 +- hosts/llama-server.nix | 40 ++++++++-- hosts/rke2-ceph.nix | 147 ++++++++++++++++++++++++++++++++++++ hosts/rke2-openebs.nix | 162 ++++++++++++++++++++++++++++++++++++++++ hosts/rke2.nix | 83 +++++++++++++-------- k8s/ceph.yaml | 164 +++++++++++++++++++++++++++++++++++++++++ k8s/kasten.yaml | 40 +++++++++- k8s/longhorn.yaml | 50 +++++++++++++ 9 files changed, 654 insertions(+), 47 deletions(-) create mode 100644 hosts/rke2-ceph.nix create mode 100644 hosts/rke2-openebs.nix create mode 100644 k8s/ceph.yaml create mode 100644 k8s/longhorn.yaml diff --git a/README.md b/README.md index 959729a..9b2a7e4 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ scp -r * root@10.10.10.10:/etc/nixos ls -l /dev/disk/by-id # Partition Disk +# WARNING: This will destroy all data on the disk(s) sudo nix \ --experimental-features "nix-command flakes" \ run github:nix-community/disko -- \ @@ -33,7 +34,7 @@ sudo reboot ## Copy Config Back to Host ```bash -scp -r * nixos@10.10.10.10:/etc/nixos +scp -r * nixos@10.0.20.201:/etc/nixos ``` ## Rebuild NixOS @@ -52,10 +53,14 @@ sudo nixos-install --flake /etc/nixos#lin-va-rke1 cat /var/lib/rancher/rke2/server/node-token # Deploy Following Nodes -echo "" > ./k8s/rke2-token +echo "" > rke2-token sudo nixos-install --flake /etc/nixos#lin-va-rke2 ``` -## TODO +## Notes -OpenEBS DiskPool Configuration not being applied. Likely need to consolidate RKE2 config, generate DiskPool config in complete, then apply. +## Kasten Port Forward + +```bash +kubectl port-forward -n kasten svc/gateway 8000:80 +``` diff --git a/flake.nix b/flake.nix index 8a308c5..4bf4ccd 100644 --- a/flake.nix +++ b/flake.nix @@ -26,7 +26,7 @@ systemConfig = ./hosts/llama-server.nix; moduleConfig = { hostName = "lin-va-llama1"; - mainDiskID = "/dev/sda"; + mainDiskID = "/dev/disk/by-id/ata-MTFDDAK512MBF-1AN1ZABHA_161212233628"; }; }; diff --git a/hosts/llama-server.nix b/hosts/llama-server.nix index 0a2210b..4d7e0cc 100644 --- a/hosts/llama-server.nix +++ b/hosts/llama-server.nix @@ -5,7 +5,7 @@ let cudaSupport = true; }).overrideAttrs (oldAttrs: { cmakeFlags = oldAttrs.cmakeFlags ++ [ - "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=ON" + "-DGGML_CUDA_ENABLE_UNIFIED_MEMORY=1" # Disable CPU Instructions - Intel(R) Core(TM) i5-3570K CPU @ 3.40GHz "-DLLAMA_FMA=OFF" @@ -19,9 +19,16 @@ let # Define Model Vars modelDir = "/models"; - modelName = "qwen2.5-coder-7b-q8_0.gguf"; + + # 7B + # modelName = "qwen2.5-coder-7b-q8_0.gguf"; + # modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF/resolve/main/${modelName}?download=true"; + + # 3B + modelName = "qwen2.5-coder-3b-q8_0.gguf"; + modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF/resolve/main/${modelName}?download=true"; + modelPath = "${modelDir}/${modelName}"; - modelUrl = "https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF/resolve/main/${modelName}?download=true"; in { @@ -97,18 +104,35 @@ in model = modelPath; port = 8080; openFirewall = true; + + # 7B + # extraFlags = [ + # "-ngl" + # "99" + # "-fa" + # "-ub" + # "512" + # "-b" + # "512" + # "-dt" + # "0.1" + # "--ctx-size" + # "4096" + # "--cache-reuse" + # "256" + # ]; + + # 3B extraFlags = [ "-ngl" "99" "-fa" "-ub" - "512" + "1024" "-b" - "512" - "-dt" - "0.1" + "1024" "--ctx-size" - "4096" + "0" "--cache-reuse" "256" ]; diff --git a/hosts/rke2-ceph.nix b/hosts/rke2-ceph.nix 
new file mode 100644 index 0000000..de2013a --- /dev/null +++ b/hosts/rke2-ceph.nix @@ -0,0 +1,147 @@ +{ config, pkgs, lib, ... }: + +{ + # Node Nix Config + options = { + dataDiskID = lib.mkOption { + type = lib.types.str; + description = "The device ID for the data disk"; + }; + serverAddr = lib.mkOption { + type = lib.types.str; + description = "The server to join"; + default = ""; + }; + networkConfig = lib.mkOption { + type = lib.types.submodule { + options = { + interface = lib.mkOption { + type = lib.types.str; + description = "Network interface name"; + example = "enp0s3"; + }; + address = lib.mkOption { + type = lib.types.str; + description = "Static IP address"; + example = "10.0.20.200"; + }; + defaultGateway = lib.mkOption { + type = lib.types.str; + description = "Default gateway IP"; + example = "10.0.20.254"; + }; + nameservers = lib.mkOption { + type = lib.types.listOf lib.types.str; + description = "List of DNS servers"; + example = [ "10.0.20.254" "8.8.8.8" ]; + default = [ "8.8.8.8" "8.8.4.4" ]; + }; + }; + }; + description = "Network configuration"; + }; + }; + + config = { + # ---------------------------------------- + # ---------- Base Configuration ---------- + # ---------------------------------------- + + # Ceph Requirements + boot.kernelModules = [ "rbd" ]; + + # Network Configuration + networking = { + hostName = config.hostName; + networkmanager.enable = false; + + # Interface Configuration + inherit (config.networkConfig) defaultGateway nameservers; + interfaces.${config.networkConfig.interface}.ipv4.addresses = [{ + inherit (config.networkConfig) address; + prefixLength = 24; + }]; + + firewall = { + enable = true; + + allowedTCPPorts = [ + # RKE2 Ports - https://docs.rke2.io/install/requirements#networking + 6443 # Kubernetes API + 9345 # RKE2 supervisor API + 2379 # etcd Client Port + 2380 # etcd Peer Port + 2381 # etcd Metrics Port + 10250 # kubelet metrics + 9099 # Canal CNI health checks + + # Ceph Ports + 3300 # Ceph MON daemon + 6789 # Ceph MON service + ] ++ lib.range 6800 7300; # Ceph OSD range + + allowedUDPPorts = [ + # RKE2 Ports - https://docs.rke2.io/install/requirements#networking + 8472 # Canal CNI with VXLAN + # 51820 # Canal CNI with WireGuard IPv4 (if using encryption) + # 51821 # Canal CNI with WireGuard IPv6 (if using encryption) + ]; + }; + }; + + # System Packages + environment.systemPackages = with pkgs; [ + htop + k9s + kubectl + kubernetes-helm + nfs-utils + tmux + vim + ]; + + # ---------------------------------------- + # ---------- RKE2 Configuration ---------- + # ---------------------------------------- + + # RKE2 Join Token + environment.etc."rancher/rke2/node-token" = lib.mkIf (config.serverAddr != "") { + source = ../rke2-token; + mode = "0600"; + user = "root"; + group = "root"; + }; + + # Enable RKE2 + services.rke2 = { + enable = true; + role = "server"; + + disable = [ + # Disable - Utilizing Traefik + "rke2-ingress-nginx" + + # Distable - Utilizing OpenEBS's Snapshot Controller + "rke2-snapshot-controller" + "rke2-snapshot-controller-crd" + "rke2-snapshot-validation-webhook" + ]; + + } // lib.optionalAttrs (config.serverAddr != "") { + serverAddr = config.serverAddr; + tokenFile = "/etc/rancher/rke2/node-token"; + }; + + # Bootstrap Kubernetes Manifests + system.activationScripts.k8s-manifests = { + deps = [ ]; + text = '' + mkdir -p /var/lib/rancher/rke2/server/manifests + + # Base Configs + cp ${../k8s/ceph.yaml} /var/lib/rancher/rke2/server/manifests/ceph-base.yaml + cp ${../k8s/kasten.yaml} 
/var/lib/rancher/rke2/server/manifests/kasten-base.yaml + ''; + }; + }; +} diff --git a/hosts/rke2-openebs.nix b/hosts/rke2-openebs.nix new file mode 100644 index 0000000..fc437f3 --- /dev/null +++ b/hosts/rke2-openebs.nix @@ -0,0 +1,162 @@ +{ config, pkgs, lib, ... }: + +{ + # Node Nix Config + options = { + dataDiskID = lib.mkOption { + type = lib.types.str; + description = "The device ID for the data disk"; + }; + serverAddr = lib.mkOption { + type = lib.types.str; + description = "The server to join"; + default = ""; + }; + networkConfig = lib.mkOption { + type = lib.types.submodule { + options = { + interface = lib.mkOption { + type = lib.types.str; + description = "Network interface name"; + example = "enp0s3"; + }; + address = lib.mkOption { + type = lib.types.str; + description = "Static IP address"; + example = "10.0.20.200"; + }; + defaultGateway = lib.mkOption { + type = lib.types.str; + description = "Default gateway IP"; + example = "10.0.20.254"; + }; + nameservers = lib.mkOption { + type = lib.types.listOf lib.types.str; + description = "List of DNS servers"; + example = [ "10.0.20.254" "8.8.8.8" ]; + default = [ "8.8.8.8" "8.8.4.4" ]; + }; + }; + }; + description = "Network configuration"; + }; + }; + + config = { + # ---------------------------------------- + # ---------- Base Configuration ---------- + # ---------------------------------------- + + # OpenEBS Mayastor Requirements + boot.kernelModules = [ "nvme_tcp" ]; + boot.kernel.sysctl = { + "vm.nr_hugepages" = 1024; + }; + + # Network Configuration + networking = { + hostName = config.hostName; + networkmanager.enable = false; + + # Interface Configuration + inherit (config.networkConfig) defaultGateway nameservers; + interfaces.${config.networkConfig.interface}.ipv4.addresses = [{ + inherit (config.networkConfig) address; + prefixLength = 24; + }]; + + firewall = { + enable = true; + + allowedTCPPorts = [ + # RKE2 Ports - https://docs.rke2.io/install/requirements#networking + 6443 # Kubernetes API + 9345 # RKE2 supervisor API + 2379 # etcd Client Port + 2380 # etcd Peer Port + 2381 # etcd Metrics Port + 10250 # kubelet metrics + 9099 # Canal CNI health checks + + # OpenEBS Mayastor - https://openebs.io/docs/user-guides/replicated-storage-user-guide/replicated-pv-mayastor/rs-installation#network-requirements + 10124 # REST API + 8420 # NVMf + 4421 # NVMf + ]; + + allowedUDPPorts = [ + # RKE2 Ports - https://docs.rke2.io/install/requirements#networking + 8472 # Canal CNI with VXLAN + # 51820 # Canal CNI with WireGuard IPv4 (if using encryption) + # 51821 # Canal CNI with WireGuard IPv6 (if using encryption) + ]; + }; + }; + + # System Packages + environment.systemPackages = with pkgs; [ + htop + k9s + kubectl + kubernetes-helm + nfs-utils + vim + ]; + + # ---------------------------------------- + # ---------- RKE2 Configuration ---------- + # ---------------------------------------- + + # RKE2 Join Token + environment.etc."rancher/rke2/node-token" = lib.mkIf (config.serverAddr != "") { + source = ../rke2-token; + mode = "0600"; + user = "root"; + group = "root"; + }; + + # Enable RKE2 + services.rke2 = { + enable = true; + role = "server"; + + disable = [ + # Disable - Utilizing Traefik + "rke2-ingress-nginx" + + # Distable - Utilizing OpenEBS's Snapshot Controller + "rke2-snapshot-controller" + "rke2-snapshot-controller-crd" + "rke2-snapshot-validation-webhook" + ]; + + # OpenEBS Scheduleable + nodeLabel = [ + "openebs.io/engine=mayastor" + ]; + + } // lib.optionalAttrs (config.serverAddr != "") { + serverAddr = 
config.serverAddr; + tokenFile = "/etc/rancher/rke2/node-token"; + }; + + # Bootstrap Kubernetes Manifests + system.activationScripts.k8s-manifests = { + deps = [ ]; + text = '' + mkdir -p /var/lib/rancher/rke2/server/manifests + + # Base Configs + cp ${../k8s/openebs.yaml} /var/lib/rancher/rke2/server/manifests/openebs-base.yaml + cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml + + # OpenEBS Disk Pool + cp ${pkgs.substituteAll { + src = ../k8s/openebs-disk-pool.yaml; + hostName = config.hostName; + dataDiskID = config.dataDiskID; + }} /var/lib/rancher/rke2/server/manifests/openebs-disk-pool-${config.hostName}.yaml + ''; + }; + }; +} diff --git a/hosts/rke2.nix b/hosts/rke2.nix index 0f9e27f..ed7f42d 100644 --- a/hosts/rke2.nix +++ b/hosts/rke2.nix @@ -47,10 +47,33 @@ # ---------- Base Configuration ---------- # ---------------------------------------- - # OpenEBS Mayastor Requirements - boot.kernelModules = [ "nvme_tcp" ]; - boot.kernel.sysctl = { - "vm.nr_hugepages" = 1024; + # Longhorn Requirements + boot.kernelModules = [ + "iscsi_tcp" + "dm_crypt" + ]; + + # Longhorn Data Disk + disko.devices = { + disk.longhorn = { + type = "disk"; + device = config.dataDiskID; + content = { + type = "gpt"; + partitions = { + longhorn = { + size = "100%"; + content = { + type = "filesystem"; + format = "xfs"; + mountpoint = "/storage/longhorn"; + mountOptions = [ "defaults" "nofail" ]; + extraArgs = [ "-d" "su=128k,sw=8" ]; + }; + }; + }; + }; + }; }; # Network Configuration @@ -78,10 +101,8 @@ 10250 # kubelet metrics 9099 # Canal CNI health checks - # OpenEBS Mayastor - https://openebs.io/docs/user-guides/replicated-storage-user-guide/replicated-pv-mayastor/rs-installation#network-requirements - 10124 # REST API - 8420 # NVMf - 4421 # NVMf + # iSCSI Port + 3260 ]; allowedUDPPorts = [ @@ -100,6 +121,8 @@ kubectl kubernetes-helm nfs-utils + openiscsi + tmux vim ]; @@ -124,39 +147,39 @@ # Disable - Utilizing Traefik "rke2-ingress-nginx" - # Distable - Utilizing OpenEBS's Snapshot Controller + # Disable - Utilizing Longhorn's Snapshot Controller "rke2-snapshot-controller" "rke2-snapshot-controller-crd" "rke2-snapshot-validation-webhook" ]; - - # OpenEBS Scheduleable - nodeLabel = [ - "openebs.io/engine=mayastor" - ]; - } // lib.optionalAttrs (config.serverAddr != "") { serverAddr = config.serverAddr; tokenFile = "/etc/rancher/rke2/node-token"; }; + # Enable OpeniSCSI + services.openiscsi = { + enable = true; + name = "iqn.2025-01.${config.hostName}:initiator"; + }; + # Bootstrap Kubernetes Manifests - # system.activationScripts.k8s-manifests = { - # deps = [ ]; - # text = '' - # mkdir -p /var/lib/rancher/rke2/server/manifests + system.activationScripts.k8s-manifests = { + deps = [ ]; + text = '' + mkdir -p /var/lib/rancher/rke2/server/manifests - # # Base Configs - # cp ${../k8s/openebs.yaml} /var/lib/rancher/rke2/server/manifests/openebs-base.yaml - # cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml + # Base Configs + cp ${../k8s/longhorn.yaml} /var/lib/rancher/rke2/server/manifests/longhorn-base.yaml + # cp ${../k8s/kasten.yaml} /var/lib/rancher/rke2/server/manifests/kasten-base.yaml + ''; + }; - # # OpenEBS Disk Pool - # cp ${pkgs.substituteAll { - # src = ../k8s/openebs-disk-pool.yaml; - # hostName = config.hostName; - # dataDiskID = config.dataDiskID; - # }} /var/lib/rancher/rke2/server/manifests/openebs-disk-pool-${config.hostName}.yaml - # ''; - # }; + # Add Symlinks Expected by Longhorn + system.activationScripts.add-symlinks = '' 
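+    # Longhorn invokes iscsiadm on the host (via nsenter) and expects it under
+    # /usr/bin, which NixOS does not populate by default, hence these symlinks.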
+ mkdir -p /usr/bin + ln -sf ${pkgs.openiscsi}/bin/iscsiadm /usr/bin/iscsiadm + ln -sf ${pkgs.openiscsi}/bin/iscsid /usr/bin/iscsid + ''; }; } diff --git a/k8s/ceph.yaml b/k8s/ceph.yaml new file mode 100644 index 0000000..cb81ca3 --- /dev/null +++ b/k8s/ceph.yaml @@ -0,0 +1,164 @@ +--- +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + labels: + name: rook-ceph + name: rook-ceph + +--- +# HelpChart +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: ceph + namespace: kube-system +spec: + repo: https://charts.rook.io/release + chart: rook-ceph + targetNamespace: rook-ceph + valuesContent: |- + enableDiscoveryDaemon: true + +--- +# CephCluster +apiVersion: ceph.rook.io/v1 +kind: CephCluster +metadata: + name: rook-ceph + namespace: rook-ceph +spec: + dataDirHostPath: /var/lib/rook + cephVersion: + image: quay.io/ceph/ceph:v19.2 + allowUnsupported: false + + # HA - One monitor per node + mon: + count: 3 + allowMultiplePerNode: false + + # Ceph Dashboard + dashboard: + enabled: true + ssl: true + + # Network Configuration + network: + provider: host + + # Storage Configuration + storage: + useAllNodes: true + useAllDevices: true + config: + osdsPerDevice: "1" + replicatedSize: "3" + + # Disruption Management + disruptionManagement: + managePodBudgets: true + osdMaintenanceTimeout: 30 + + # Resource Management + # resources: + # mgr: + # limits: + # cpu: "1000m" + # memory: "1Gi" + # requests: + # cpu: "500m" + # memory: "512Mi" + # mon: + # limits: + # cpu: "1000m" + # memory: "1Gi" + # requests: + # cpu: "500m" + # memory: "512Mi" + # osd: + # limits: + # cpu: "2000m" + # memory: "4Gi" + # requests: + # cpu: "1000m" + # memory: "2Gi" + +--- +# BlockPool - Single Replica +apiVersion: ceph.rook.io/v1 +kind: CephBlockPool +metadata: + name: ceph-block-pool-single + namespace: rook-ceph +spec: + failureDomain: host + replicated: + size: 1 + +--- +# BlockPool - Three Replica +apiVersion: ceph.rook.io/v1 +kind: CephBlockPool +metadata: + name: ceph-block-pool-triple + namespace: rook-ceph +spec: + failureDomain: host + replicated: + size: 3 + +--- +# StorageClass - Three Replica +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ceph-block-triple + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: rook-ceph.rbd.csi.ceph.com +parameters: + pool: ceph-block-pool-triple + clusterID: rook-ceph + imageFormat: "2" + imageFeatures: layering + + # Ceph CSI driver + csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph + csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph + csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node + csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph + csi.storage.k8s.io/fstype: ext4 + +allowVolumeExpansion: true +volumeBindingMode: Immediate +reclaimPolicy: Delete + +--- +# StorageClass - Single Replica +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ceph-block-single +provisioner: rook-ceph.rbd.csi.ceph.com +parameters: + pool: ceph-block-pool-single + clusterID: rook-ceph + imageFormat: "2" + imageFeatures: layering + + # Ceph CSI driver + csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph + csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner + csi.storage.k8s.io/controller-expand-secret-namespace: 
rook-ceph + csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node + csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph + csi.storage.k8s.io/fstype: ext4 + +allowVolumeExpansion: true +volumeBindingMode: Immediate +reclaimPolicy: Delete diff --git a/k8s/kasten.yaml b/k8s/kasten.yaml index 6e635a3..ae38d63 100644 --- a/k8s/kasten.yaml +++ b/k8s/kasten.yaml @@ -45,7 +45,39 @@ spec: repo: https://charts.kasten.io/ chart: k10 targetNamespace: kasten - valuesContent: |- - global: - persistence: - storageClass: mayastor-r3 +--- +kind: Profile +apiVersion: config.kio.kasten.io/v1alpha1 +metadata: + name: k10-backup-profile + namespace: kasten +spec: + locationSpec: + type: FileStore + fileStore: + claimName: va-unraid-backup-rw + credential: + secretType: "" + secret: + apiVersion: "" + kind: "" + name: "" + namespace: "" + type: Location +--- +apiVersion: config.kio.kasten.io/v1alpha1 +kind: TransformSet +metadata: + name: storage-class-rename + namespace: kasten +spec: + comment: Renames cstor-r1 to ceph-block-triple + transforms: + - json: + - op: replace + path: /spec/storageClassName + value: ceph-block-triple + name: StorageClassRename + subject: + name: "" + resource: persistentvolumeclaims diff --git a/k8s/longhorn.yaml b/k8s/longhorn.yaml new file mode 100644 index 0000000..3ec96b2 --- /dev/null +++ b/k8s/longhorn.yaml @@ -0,0 +1,50 @@ +--- +# Namespace +apiVersion: v1 +kind: Namespace +metadata: + labels: + name: longhorn + name: longhorn + +--- +# HelpChart +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: longhorn + namespace: kube-system +spec: + repo: https://charts.longhorn.io + chart: longhorn + targetNamespace: longhorn + valuesContent: |- + persistence: + defaultClass: true + defaultClassReplicaCount: 3 + reclaimPolicy: Delete + + defaultSettings: + defaultDataPath: /storage/longhorn + defaultReplicaCount: 3 + nodeDownPodDeletionPolicy: delete-both-statefulset-and-deployment-pod + guaranteedEngineManagerCPU: 0.25 + guaranteedReplicaManagerCPU: 0.25 + + longhornManager: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" +--- +# StorageClass +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: longhorn-block-triple +provisioner: driver.longhorn.io +allowVolumeExpansion: true +parameters: + numberOfReplicas: "3" + staleReplicaTimeout: "2880" + fsType: "ext4"
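
A few quick checks for the pieces this patch touches. For the llama-server changes (3B model, larger `-ub`/`-b` batches, `--ctx-size 0` to take the model's default context), the service can be poked over HTTP once the node is rebuilt; this sketch assumes the host resolves as `lin-va-llama1`, uses the configured port 8080, and relies on the `/health` and OpenAI-compatible `/v1/chat/completions` routes that llama-server normally exposes.

```bash
# Liveness check
curl -s http://lin-va-llama1:8080/health

# Minimal chat completion against the loaded qwen2.5-coder-3b model
curl -s http://lin-va-llama1:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"messages":[{"role":"user","content":"Write a hello world in Go"}],"max_tokens":64}'
```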
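For a node built from the reworked `hosts/rke2.nix`, a minimal smoke test of the Longhorn switch looks like the following, assuming RKE2's default kubeconfig at `/etc/rancher/rke2/rke2.yaml` and the usual `iscsid` unit name from the openiscsi module.

```bash
export KUBECONFIG=/etc/rancher/rke2/rke2.yaml

# Nodes joined and Ready
kubectl get nodes -o wide

# Longhorn chart rolled out into the longhorn namespace
kubectl -n longhorn get pods

# Expect the chart's default "longhorn" class plus longhorn-block-triple
kubectl get storageclass

# Host-side prerequisites: iscsid running and the symlink from the activation script
systemctl status iscsid --no-pager
ls -l /usr/bin/iscsiadm

# XFS data disk mounted where defaultDataPath points
findmnt /storage/longhorn
```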
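Dynamic provisioning can be exercised with a throwaway claim; `test-pvc` and the 1Gi request are arbitrary, and the storageClassName can be swapped for `ceph-block-triple` or `ceph-block-single` on the rke2-ceph variant.

```bash
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-pvc
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: longhorn-block-triple
  resources:
    requests:
      storage: 1Gi
EOF

# Should go Pending -> Bound once the CSI driver provisions a volume
kubectl get pvc test-pvc --watch

# Clean up
kubectl delete pvc test-pvc
```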
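On hosts built from the new `hosts/rke2-ceph.nix`, the Rook operator comes from the HelmChart and the cluster/pool/StorageClass objects from `k8s/ceph.yaml`; rough health checks, assuming the Rook CRDs have reconciled:

```bash
export KUBECONFIG=/etc/rancher/rke2/rke2.yaml

# Operator plus mon/mgr/osd daemons
kubectl -n rook-ceph get pods

# PHASE / HEALTH are reported on the CephCluster once the mons form quorum
kubectl -n rook-ceph get cephcluster rook-ceph

# Pools backing the two RBD storage classes
kubectl -n rook-ceph get cephblockpool
kubectl get storageclass ceph-block-single ceph-block-triple

# rbd module requested via boot.kernelModules
lsmod | grep rbd
```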
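The Kasten changes replace the chart's storageClass override with a FileStore Profile and a TransformSet that rewrites `spec.storageClassName` on restored PVCs to `ceph-block-triple`. The Profile's `claimName: va-unraid-backup-rw` refers to a PVC that must already exist in the `kasten` namespace. A rough check, assuming K10's usual CRD names (`profiles`/`transformsets` under `config.kio.kasten.io`) and dashboard path:

```bash
# K10 components and the backup-target claim referenced by the Profile
kubectl -n kasten get pods
kubectl -n kasten get pvc va-unraid-backup-rw

# Profile and TransformSet created by the bundled manifest
kubectl -n kasten get profiles.config.kio.kasten.io
kubectl -n kasten get transformsets.config.kio.kasten.io

# Dashboard via the port-forward from the README (typically served under /k10/)
kubectl port-forward -n kasten svc/gateway 8000:80
```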