Deploy B300, B200 or H200 clusters via Verda Cloud (formerly DataCrunch) API
deploy-cluster.sh
#!/usr/bin/env bash
set -euo pipefail

if [ $# -lt 1 ]; then
  echo "usage: $0 ENVIRONMENT" >&2
  exit 1
fi

ENVIRONMENT="$1"

if [ ! -f "$(dirname "$0")/.env.$ENVIRONMENT" ]; then
  echo "Environment file $(dirname "$0")/.env.$ENVIRONMENT not found" >&2
  exit 1
fi

source "$(dirname "$0")/.env.$ENVIRONMENT"

# make curl terse on success, but produce more info on errors
CURL_ARGS=(-s --fail-with-body -w '%{onerror}%{stderr}\ncurl: %{method} %{url_effective}: %{errormsg} (%{response_code})\n')

TOKEN=""
TOKEN_CREATED=0
TOKEN_EXPIRES_IN=0
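
# Fetch an OAuth2 access token from the API, reusing the cached token while it has not expired yet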
function get_token() {
  if [ $TOKEN_EXPIRES_IN -gt 0 ] && [ $TOKEN_CREATED -gt $(($(date +%s) - $TOKEN_EXPIRES_IN)) ]; then
    # Token is still valid, using existing token
    return
  fi
  TOKEN_RESPONSE=$(curl "${CURL_ARGS[@]}" -X POST "$DATACRUNCH_API/v1/oauth2/token" -H "Content-type: application/json" -d "{\"grant_type\": \"client_credentials\", \"client_id\": \"$DATACRUNCH_CLIENT_ID\", \"client_secret\": \"$DATACRUNCH_CLIENT_SECRET\" }")
  export TOKEN=$(echo "$TOKEN_RESPONSE" | jq -r .access_token)
  export TOKEN_EXPIRES_IN=$(echo "$TOKEN_RESPONSE" | jq -r .expires_in)
  export TOKEN_CREATED=$(date +%s)
  echo "Got API access token, expires in $TOKEN_EXPIRES_IN seconds"
}

get_token

SSH_KEYS=$(curl "${CURL_ARGS[@]}" -H "Authorization: Bearer $TOKEN" "$DATACRUNCH_API/v1/sshkeys" | jq '[.[].id]')
if [ "$SSH_KEYS" = "[]" ]; then
  echo "No SSH keys found"
  exit 1
fi
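
# Cluster parameters, all overridable via environment variables (or the .env file)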
NUM=$((RANDOM % 1000000))
CLUSTER_TYPE=${CLUSTER_TYPE:-"16H200"}
CLUSTER_IMAGE=${CLUSTER_IMAGE:-""}
DATACRUNCH_LOCATION=${DATACRUNCH_LOCATION:-"FIN-03"}
CLUSTER_NAME=${CLUSTER_NAME:-"test-cluster-$NUM"}

if [ -z "$CLUSTER_IMAGE" ]; then
  # Find first suitable cluster image
  CLUSTER_IMAGE=$(curl "${CURL_ARGS[@]}" "$DATACRUNCH_API/v1/images/cluster" -H "Authorization: Bearer $TOKEN" | jq -r "[.[] | select(.is_cluster == true)][0] | .image_type")
  if [ -z "$CLUSTER_IMAGE" ]; then
    echo "No cluster images found"
    exit 1
  fi
fi

# Check that cluster type exists
CLUSTER_TYPES_RESPONSE=$(curl "${CURL_ARGS[@]}" "$DATACRUNCH_API/v1/cluster-types" -H "Authorization: Bearer $TOKEN" | jq ".[] | select(.cluster_type == \"$CLUSTER_TYPE\").name")
if [ -z "$CLUSTER_TYPES_RESPONSE" ]; then
  echo "Cluster type $CLUSTER_TYPE does not exist"
  exit 1
fi

# Check if cluster type is available in the location
AVAILABILITY=$(curl "${CURL_ARGS[@]}" "$DATACRUNCH_API/v1/cluster-availability/$CLUSTER_TYPE?location_code=$DATACRUNCH_LOCATION" -H "Authorization: Bearer $TOKEN")
if [ "$AVAILABILITY" = "false" ]; then
  echo "Cluster $CLUSTER_TYPE is not available in $DATACRUNCH_LOCATION"
  exit 1
fi
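
# Build the cluster creation request body with jq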
PAYLOAD=$(jq -n \
  --argjson SSH_KEYS "$SSH_KEYS" \
  --arg CLUSTER_NAME "$CLUSTER_NAME" \
  --arg SHARED_VOLUME_NAME "test-cluster-shared-volume-$NUM" \
  --arg CLUSTER_TYPE "$CLUSTER_TYPE" \
  --arg CLUSTER_IMAGE "$CLUSTER_IMAGE" \
  --arg DATACRUNCH_LOCATION "$DATACRUNCH_LOCATION" \
  '{
    "cluster_type": $CLUSTER_TYPE,
    "image": $CLUSTER_IMAGE,
    "ssh_key_ids": $SSH_KEYS,
    "hostname": $CLUSTER_NAME,
    "description": "Created by a ./tools/deploy-cluster script",
    "location_code": $DATACRUNCH_LOCATION,
    "shared_volume": {
      "name": $SHARED_VOLUME_NAME,
      "size": 30000
    }
  }')

echo "Creating new cluster..."
RESPONSE=$(curl "${CURL_ARGS[@]}" -X POST "$DATACRUNCH_API/v1/clusters" \
  -H "Authorization: Bearer $TOKEN" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD")
CLUSTER_ID=$(echo "$RESPONSE" | jq -r '.id')
if [ -z "$CLUSTER_ID" ] || [ "$CLUSTER_ID" = "null" ]; then
  echo "Failed to create cluster. Response: $RESPONSE"
  exit 1
fi
echo "Cluster created with ID: $CLUSTER_ID"
echo ""
echo "Fetching cluster details..."
curl "${CURL_ARGS[@]}" -H "Authorization: Bearer $TOKEN" "$DATACRUNCH_API/v1/clusters/$CLUSTER_ID" | jq '.'
echo ""
MAX_WAIT_SECONDS=${MAX_WAIT_SECONDS:-1200}
CHECK_INTERVAL=2
echo "Waiting for cluster to be running (up to $((MAX_WAIT_SECONDS / 60)) minutes)..."
START_TIME=$(date +%s)
SHARED_VOLUME_ID=""
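
# Poll the cluster status until it is running, errored, or the timeout elapses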
while true; do
  ELAPSED=$(($(date +%s) - START_TIME))
  if [ $ELAPSED -ge $MAX_WAIT_SECONDS ]; then
    echo "Timeout: Cluster $CLUSTER_ID did not reach running status within $((MAX_WAIT_SECONDS / 60)) minutes"
    exit 1
  fi
  get_token
  CLUSTER_RESPONSE=$(curl "${CURL_ARGS[@]}" -H "Authorization: Bearer $TOKEN" "$DATACRUNCH_API/v1/clusters/$CLUSTER_ID" 2>/dev/null || echo "")
  if [ -z "$CLUSTER_RESPONSE" ]; then
    echo " [${ELAPSED}s] Failed to fetch cluster status, retrying..."
    sleep $CHECK_INTERVAL
    continue
  fi
  STATUS=$(echo "$CLUSTER_RESPONSE" | jq -r '.status' 2>/dev/null || echo "")
  SHARED_VOLUME_ID=$(echo "$CLUSTER_RESPONSE" | jq -r '.shared_volumes[0].id')
  if [ "$STATUS" = "running" ]; then
    echo " [${ELAPSED}s] Cluster $CLUSTER_NAME ($CLUSTER_ID) is now running ✓"
    break
  elif [ "$STATUS" = "error" ] || [ "$STATUS" = "notfound" ]; then
    echo " Error: Cluster $CLUSTER_NAME ($CLUSTER_ID) is in status: $STATUS"
    exit 1
  else
    echo " [${ELAPSED}s] Cluster $CLUSTER_NAME ($CLUSTER_ID) status: $STATUS (waiting for running...)"
    sleep $CHECK_INTERVAL
  fi
done

# Comment out the following exit to automatically remove the cluster afterwards (for testing)
exit 0

echo "Press Enter to discontinue the cluster (or wait 20 seconds, Ctrl-C to cancel)..."
for _ in {1..20}; do read -rs -n1 -t1 || printf "."; done; echo
echo ""
echo "Discontinuing cluster..."
while [[ -z "$SHARED_VOLUME_ID" || "$SHARED_VOLUME_ID" == "null" ]]; do
  sleep $CHECK_INTERVAL
  get_token
  CLUSTER_RESPONSE=$(curl "${CURL_ARGS[@]}" -H "Authorization: Bearer $TOKEN" "$DATACRUNCH_API/v1/clusters/$CLUSTER_ID" 2>/dev/null || echo "")
  SHARED_VOLUME_ID=$(echo "$CLUSTER_RESPONSE" | jq -r '.shared_volumes[0].id')
  echo "Shared volume ID: $SHARED_VOLUME_ID"
  echo "Cluster response: $CLUSTER_RESPONSE"
done
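
# Discontinue (shut down) the cluster via the bulk actions endpoint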
ACTIONS_JSON=$(jq -n --arg CLUSTER_ID "$CLUSTER_ID" '{ "actions": [{ "action": "discontinue", "id": $CLUSTER_ID }] }')
RESPONSE=$(curl "${CURL_ARGS[@]}" -X PUT "$DATACRUNCH_API/v1/clusters" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d "$ACTIONS_JSON")
echo "Discontinue request submitted successfully"

read -p "Press Enter to delete the shared volume (moves it to trash; Ctrl-C to cancel)..."
if [[ -n "$SHARED_VOLUME_ID" && "$SHARED_VOLUME_ID" != "null" ]]; then
  echo "Soft deleting shared volume ($SHARED_VOLUME_ID)..."
  curl "$DATACRUNCH_API/v1/volumes/$SHARED_VOLUME_ID" \
    --request DELETE \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json" \
    -d '{ "is_permanent": false }'
  echo "Shared volume is now in trash (manual removal needed, it is still being billed \$\$\$)"
fi
# Put it as .env.<name>
DATACRUNCH_CLIENT_ID=REPLACE
DATACRUNCH_CLIENT_SECRET=REPLACE
DATACRUNCH_API=https://api.datacrunch.io
DATACRUNCH_LOCATION=FIN-03
#CLUSTER_TYPE=16B200
#CLUSTER_IMAGE=""
#CLUSTER_NAME="my-cluster1"
This allows you to deploy B300 and B200 clusters fully automatically using the Verda (formerly DataCrunch) API.
Verda allows you to create clusters with 16x B300 GPUs and more.
Learn more about the Nvidia B300: https://datacrunch.io/b300
Usage

1. Create a new .env.<name> file by creating an API key in the Verda Console (see the example env file above).
2. Run bash deploy-cluster.sh <name> to create the cluster, for example as sketched below.
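
A minimal invocation, assuming an environment file named .env.dev next to the script (the CLUSTER_TYPE and CLUSTER_NAME overrides are optional):

# hypothetical environment name "dev"; overrides match the commented-out variables in the example env file
CLUSTER_TYPE=16B200 CLUSTER_NAME="my-cluster1" bash deploy-cluster.sh dev

Troubleshooting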