Skip to content

Instantly share code, notes, and snippets.

@sozercan
Created December 8, 2025 21:28
Show Gist options
  • Select an option

  • Save sozercan/74192101363d83f2b4aadcaa350e9db4 to your computer and use it in GitHub Desktop.

Select an option

Save sozercan/74192101363d83f2b4aadcaa350e9db4 to your computer and use it in GitHub Desktop.
---
# RayService deploying Qwen3-0.6B with prefill/decode (P/D) disaggregation:
# one Serve app built by ray.serve.llm's build_pd_openai_app, backed by a
# RayCluster with a CPU-only head node and separate prefill/decode GPU
# worker groups, matched to the Serve replicas via custom Ray resources
# (prefill_node / decode_node).
# NOTE(review): indentation reconstructed from a flattened copy — confirm
# nesting against the KubeRay RayService CRD and Ray Serve LLM schema.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: qwen-disaggregated
spec:
  # serveConfigV2 is an opaque string to KubeRay; Ray Serve parses it.
  serveConfigV2: |
    applications:
      - name: pd-disaggregation
        import_path: ray.serve.llm:build_pd_openai_app
        route_prefix: "/"
        args:
          # Prefill side: processes prompts, produces KV cache entries.
          prefill_config:
            model_loading_config:
              model_id: "qwen"
              model_source: "Qwen/Qwen3-0.6B"
            deployment_config:
              autoscaling_config:
                min_replicas: 1
                max_replicas: 2
              ray_actor_options:
                # Schedule each prefill replica onto a node advertising the
                # custom "prefill_node" Ray resource (set by the worker group
                # below via rayStartParams).
                resources:
                  prefill_node: 1
            # Passed through to the vLLM engine.
            engine_kwargs:
              tensor_parallel_size: 1
              gpu_memory_utilization: 0.9
              dtype: "auto"
              max_num_seqs: 40
              max_model_len: 16384
              enable_chunked_prefill: true
              enable_prefix_caching: true
              # KV-cache handoff: this engine produces KV blocks for the
              # decode side over NIXL.
              kv_transfer_config:
                kv_connector: "NixlConnector"
                kv_role: "kv_producer"
          # Decode side: consumes transferred KV cache, generates tokens.
          decode_config:
            model_loading_config:
              model_id: "qwen"
              model_source: "Qwen/Qwen3-0.6B"
            deployment_config:
              autoscaling_config:
                min_replicas: 1
                max_replicas: 2
              ray_actor_options:
                # Pin decode replicas to nodes advertising "decode_node".
                resources:
                  decode_node: 1
            engine_kwargs:
              tensor_parallel_size: 1
              gpu_memory_utilization: 0.9
              dtype: "auto"
              max_num_seqs: 40
              max_model_len: 16384
              enable_chunked_prefill: true
              enable_prefix_caching: true
              kv_transfer_config:
                kv_connector: "NixlConnector"
                kv_role: "kv_consumer"
  rayClusterConfig:
    headGroupSpec:
      rayStartParams:
        # Head node schedules no model replicas — GPUs live on workers only.
        num-gpus: "0"
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray-llm:2.52.0-py311-cu128
              resources:
                limits:
                  cpu: "4"
                  memory: "32Gi"
                requests:
                  cpu: "4"
                  memory: "32Gi"
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
    workerGroupSpecs:
      # Prefill worker group - handles prompt processing
      - groupName: prefill-group
        replicas: 1
        minReplicas: 1
        maxReplicas: 2
        rayStartParams:
          # KubeRay convention: an escaped-JSON string advertising the
          # custom Ray resource the Serve prefill replicas request above.
          resources: '"{\"prefill_node\": 1}"'
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray-llm:2.52.0-py311-cu128
                resources:
                  limits:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
            tolerations:
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"
      # Decode worker group - handles token generation
      - groupName: decode-group
        replicas: 1
        minReplicas: 1
        maxReplicas: 2
        rayStartParams:
          resources: '"{\"decode_node\": 1}"'
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray-llm:2.52.0-py311-cu128
                resources:
                  limits:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
            tolerations:
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment