Skip to content

Instantly share code, notes, and snippets.

@sozercan
Created December 8, 2025 21:28
Show Gist options
  • Select an option

  • Save sozercan/74192101363d83f2b4aadcaa350e9db4 to your computer and use it in GitHub Desktop.

Select an option

Save sozercan/74192101363d83f2b4aadcaa350e9db4 to your computer and use it in GitHub Desktop.
---
# RayService deploying Qwen3-0.6B with prefill/decode (P/D) disaggregation:
# one Serve app built by ray.serve.llm's build_pd_openai_app, backed by a
# RayCluster with a CPU-only head node and separate prefill/decode GPU
# worker groups, matched to the Serve replicas via custom Ray resources
# (prefill_node / decode_node).
# NOTE(review): indentation reconstructed from a flattened copy — confirm
# nesting against the KubeRay RayService CRD and Ray Serve LLM schema.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: qwen-disaggregated
spec:
  # serveConfigV2 is an opaque string to KubeRay; Ray Serve parses it.
  serveConfigV2: |
    applications:
      - name: pd-disaggregation
        import_path: ray.serve.llm:build_pd_openai_app
        route_prefix: "/"
        args:
          # Prefill side: processes prompts, produces KV cache entries.
          prefill_config:
            model_loading_config:
              model_id: "qwen"
              model_source: "Qwen/Qwen3-0.6B"
            deployment_config:
              autoscaling_config:
                min_replicas: 1
                max_replicas: 2
              ray_actor_options:
                # Schedule each prefill replica onto a node advertising the
                # custom "prefill_node" Ray resource (set by the worker group
                # below via rayStartParams).
                resources:
                  prefill_node: 1
            # Passed through to the vLLM engine.
            engine_kwargs:
              tensor_parallel_size: 1
              gpu_memory_utilization: 0.9
              dtype: "auto"
              max_num_seqs: 40
              max_model_len: 16384
              enable_chunked_prefill: true
              enable_prefix_caching: true
              # KV-cache handoff: this engine produces KV blocks for the
              # decode side over NIXL.
              kv_transfer_config:
                kv_connector: "NixlConnector"
                kv_role: "kv_producer"
          # Decode side: consumes transferred KV cache, generates tokens.
          decode_config:
            model_loading_config:
              model_id: "qwen"
              model_source: "Qwen/Qwen3-0.6B"
            deployment_config:
              autoscaling_config:
                min_replicas: 1
                max_replicas: 2
              ray_actor_options:
                # Pin decode replicas to nodes advertising "decode_node".
                resources:
                  decode_node: 1
            engine_kwargs:
              tensor_parallel_size: 1
              gpu_memory_utilization: 0.9
              dtype: "auto"
              max_num_seqs: 40
              max_model_len: 16384
              enable_chunked_prefill: true
              enable_prefix_caching: true
              kv_transfer_config:
                kv_connector: "NixlConnector"
                kv_role: "kv_consumer"
  rayClusterConfig:
    headGroupSpec:
      rayStartParams:
        # Head node schedules no model replicas — GPUs live on workers only.
        num-gpus: "0"
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray-llm:2.52.0-py311-cu128
              resources:
                limits:
                  cpu: "4"
                  memory: "32Gi"
                requests:
                  cpu: "4"
                  memory: "32Gi"
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
    workerGroupSpecs:
      # Prefill worker group - handles prompt processing
      - groupName: prefill-group
        replicas: 1
        minReplicas: 1
        maxReplicas: 2
        rayStartParams:
          # KubeRay convention: an escaped-JSON string advertising the
          # custom Ray resource the Serve prefill replicas request above.
          resources: '"{\"prefill_node\": 1}"'
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray-llm:2.52.0-py311-cu128
                resources:
                  limits:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
            tolerations:
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"
      # Decode worker group - handles token generation
      - groupName: decode-group
        replicas: 1
        minReplicas: 1
        maxReplicas: 2
        rayStartParams:
          resources: '"{\"decode_node\": 1}"'
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray-llm:2.52.0-py311-cu128
                resources:
                  limits:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
            tolerations:
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment