wip
This commit is contained in:
@@ -464,6 +464,7 @@ in
|
|||||||
vllmCmd = ''
|
vllmCmd = ''
|
||||||
set -e; pip install xxhash pandas scipy -q;
|
set -e; pip install xxhash pandas scipy -q;
|
||||||
python3 -m vllm._genesis.patches.apply_all;
|
python3 -m vllm._genesis.patches.apply_all;
|
||||||
|
python3 /patches/patch_workspace_lock_disable.py;
|
||||||
python3 /patches/patch_tolist_cudagraph.py;
|
python3 /patches/patch_tolist_cudagraph.py;
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
python3 /patches/patch_timings_07351e088.py;
|
||||||
exec vllm serve
|
exec vllm serve
|
||||||
@@ -513,10 +514,11 @@ in
|
|||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
|
||||||
|
-v /mnt/ssd/vLLM/Patches/patch_workspace_lock_disable.py:/patches/patch_workspace_lock_disable.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \
|
vllm/vllm-openai:nightly-7a1eb8ac2ec4ea69338c51dc7afd4b15010abfa8 \
|
||||||
-c "${vllmCmdFlat}"
|
-c "${vllmCmdFlat}"
|
||||||
'';
|
'';
|
||||||
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
||||||
@@ -534,7 +536,7 @@ in
|
|||||||
# TurboQuant 3-bit KV + MTP n=3 + PN12/P104 cliff-closure stack
|
# TurboQuant 3-bit KV + MTP n=3 + PN12/P104 cliff-closure stack
|
||||||
"vllm-qwen3.6-27b-long-vision" = {
|
"vllm-qwen3.6-27b-long-vision" = {
|
||||||
name = "vLLM Qwen3.6 (27B) - Long Vision";
|
name = "vLLM Qwen3.6 (27B) - Long Vision";
|
||||||
macros.ctx = "140000";
|
macros.ctx = "198000";
|
||||||
proxy = "http://127.0.0.1:\${PORT}";
|
proxy = "http://127.0.0.1:\${PORT}";
|
||||||
cmd =
|
cmd =
|
||||||
let
|
let
|
||||||
@@ -542,8 +544,8 @@ in
|
|||||||
set -e; pip install xxhash pandas scipy -q;
|
set -e; pip install xxhash pandas scipy -q;
|
||||||
python3 -m vllm._genesis.patches.apply_all;
|
python3 -m vllm._genesis.patches.apply_all;
|
||||||
python3 /patches/patch_pn12_ffn_pool_anchor.py;
|
python3 /patches/patch_pn12_ffn_pool_anchor.py;
|
||||||
python3 /patches/patch_pn12_compile_safe_custom_op.py;
|
|
||||||
python3 /patches/patch_fa_max_seqlen_clamp.py;
|
python3 /patches/patch_fa_max_seqlen_clamp.py;
|
||||||
|
python3 /patches/patch_workspace_lock_disable.py;
|
||||||
python3 /patches/patch_tolist_cudagraph.py;
|
python3 /patches/patch_tolist_cudagraph.py;
|
||||||
python3 /patches/patch_timings_07351e088.py;
|
python3 /patches/patch_timings_07351e088.py;
|
||||||
exec vllm serve
|
exec vllm serve
|
||||||
@@ -553,7 +555,7 @@ in
|
|||||||
--dtype float16
|
--dtype float16
|
||||||
--tensor-parallel-size 1
|
--tensor-parallel-size 1
|
||||||
--max-model-len ''${ctx}
|
--max-model-len ''${ctx}
|
||||||
--gpu-memory-utilization 0.95
|
--gpu-memory-utilization 0.98
|
||||||
--max-num-seqs 1
|
--max-num-seqs 1
|
||||||
--max-num-batched-tokens 4128
|
--max-num-batched-tokens 4128
|
||||||
--kv-cache-dtype turboquant_3bit_nc
|
--kv-cache-dtype turboquant_3bit_nc
|
||||||
@@ -596,17 +598,16 @@ in
|
|||||||
-e GENESIS_ENABLE_PN12_FFN_INTERMEDIATE_POOL=1 \
|
-e GENESIS_ENABLE_PN12_FFN_INTERMEDIATE_POOL=1 \
|
||||||
-e GENESIS_ENABLE_PN13_CUDA_GRAPH_LAMBDA_ARITY=1 \
|
-e GENESIS_ENABLE_PN13_CUDA_GRAPH_LAMBDA_ARITY=1 \
|
||||||
-e GENESIS_ENABLE_FA_MAX_SEQLEN_CLAMP=1 \
|
-e GENESIS_ENABLE_FA_MAX_SEQLEN_CLAMP=1 \
|
||||||
-e GENESIS_ENABLE_PN17_FA2_LSE_CLAMP=1 \
|
|
||||||
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
-v /mnt/ssd/vLLM/Models:/root/.cache/huggingface \
|
||||||
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
-v /mnt/ssd/vLLM/Patches/genesis/vllm/_genesis:/usr/local/lib/python3.12/dist-packages/vllm/_genesis:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_tolist_cudagraph.py:/patches/patch_tolist_cudagraph.py:ro \
|
||||||
|
-v /mnt/ssd/vLLM/Patches/patch_workspace_lock_disable.py:/patches/patch_workspace_lock_disable.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_pn12_ffn_pool_anchor.py:/patches/patch_pn12_ffn_pool_anchor.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_pn12_ffn_pool_anchor.py:/patches/patch_pn12_ffn_pool_anchor.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_pn12_compile_safe_custom_op.py:/patches/patch_pn12_compile_safe_custom_op.py:ro \
|
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_fa_max_seqlen_clamp.py:/patches/patch_fa_max_seqlen_clamp.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_fa_max_seqlen_clamp.py:/patches/patch_fa_max_seqlen_clamp.py:ro \
|
||||||
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
-v /mnt/ssd/vLLM/Patches/patch_timings_07351e088.py:/patches/patch_timings_07351e088.py:ro \
|
||||||
-p ''${PORT}:8000 \
|
-p ''${PORT}:8000 \
|
||||||
--entrypoint /bin/bash \
|
--entrypoint /bin/bash \
|
||||||
vllm/vllm-openai:nightly-07351e0883470724dd5a7e9730ed10e01fc99d08 \
|
vllm/vllm-openai:nightly-7a1eb8ac2ec4ea69338c51dc7afd4b15010abfa8 \
|
||||||
-c "${vllmCmdFlat}"
|
-c "${vllmCmdFlat}"
|
||||||
'';
|
'';
|
||||||
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
cmdStop = "${pkgs.docker}/bin/docker stop \${MODEL_ID}";
|
||||||
|
|||||||
Reference in New Issue
Block a user