RayService: Qwen3-0.6B on KubeRay, served with vLLM via Ray Serve's OpenAI-compatible app

@sozercan · Created December 8, 2025 21:29
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: qwen
spec:
  serveConfigV2: |
    applications:
      - name: llm_app
        import_path: ray.serve.llm:build_openai_app
        route_prefix: "/"
        # Runtime env for the Serve app
        runtime_env:
          env_vars:
            VLLM_USE_V1: "1" # opt in to the vLLM V1 engine
        # Args passed to build_openai_app
        args:
          llm_configs:
            - model_loading_config:
                model_id: "qwen"
                model_source: "Qwen/Qwen3-0.6B"
              accelerator_type: "A100"
              deployment_config:
                autoscaling_config:
                  min_replicas: 1
                  max_replicas: 2
              engine_kwargs:
                # 0.6B is tiny; a single GPU is enough
                tensor_parallel_size: 1
                pipeline_parallel_size: 1
                gpu_memory_utilization: 0.9
                dtype: "auto"
                max_num_seqs: 40
                max_model_len: 16384
                enable_chunked_prefill: true
                enable_prefix_caching: true
  rayClusterConfig:
    headGroupSpec:
      rayStartParams:
        num-gpus: "0" # keep the head CPU-only; model replicas run on the GPU workers
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray-llm:2.52.0-py311-cu128
              resources:
                limits:
                  cpu: "4"
                  memory: "32Gi"
                requests:
                  cpu: "4"
                  memory: "32Gi"
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
    workerGroupSpecs:
      - groupName: gpu-group
        replicas: 2
        minReplicas: 2
        maxReplicas: 2
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray-llm:2.52.0-py311-cu128
                resources:
                  # Adjust these to match your node flavor
                  limits:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
                  requests:
                    cpu: "8"
                    memory: "64Gi"
                    nvidia.com/gpu: "1"
            tolerations:
              - key: "nvidia.com/gpu"
                operator: "Exists"
                effect: "NoSchedule"