Created
August 8, 2024 18:54
-
-
Save aamir814/d7f139a9110ce38eee2356159c0e8d51 to your computer and use it in GitHub Desktop.
Transfer Data (CDC) from Cloud SQL (MySql) with Private IP to BigQuery using Datastream - Demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| ##################################################################### | |
| # Purpose: Following steps show how to connect datastream to a Cloud | |
| # SQL instance with private IP. These steps can be used to | |
| # quickly demo data transfer from Cloud SQL to BigQuery. | |
| # | |
| # Assumption: | |
| # * Google cloud project exists with network setup as desired. | |
| # * all the necessary privileges have been granted to create | |
| # VM, Cloud SQL and datastream | |
| # | |
| # REFERENCES | |
| # - https://cloud.google.com/vpc/docs/create-modify-vpc-networks | |
| # - https://cloud.google.com/sql/docs/mysql/configure-private-services-access | |
| # - https://cloud.google.com/sql/docs/mysql/configure-private-ip | |
| # - https://cloud.google.com/iam/docs/service-accounts-create | |
| # - https://cloud.google.com/compute/docs/instances/startup-scripts/linux | |
| # - https://cloud.google.com/storage/docs/discover-object-storage-gcloud | |
| # - https://cloud.google.com/datastream/docs/quickstart-replication-to-bigquery | |
| # - https://cloud.google.com/datastream/docs/private-connectivity | |
| # - https://cloud.google.com/sdk/gcloud/reference/datastream/streams/create | |
| # - https://github.com/datacharmer/test_db | |
| # - https://gist.github.com/mikesparr/ebea5a7a490a0a7668ba87b07db31338 | |
| ##################################################################### | |
# Environment — change these to match your project / network.
export PROJECT_ID="$(gcloud config get-value project)"
export REGION="us-central1"
export GCP_ZONE="us-central1-a"
export NETWORK_NAME="default"

# Enable the APIs used below.
# fix: datastream.googleapis.com was listed twice in the original.
gcloud services enable \
    compute.googleapis.com \
    servicenetworking.googleapis.com \
    datastream.googleapis.com \
    sqladmin.googleapis.com \
    iap.googleapis.com
#############################################################
# CREATE CLOUD SQL DATABASE (MYSQL)
#############################################################
export DB_INSTANCE_NAME="mysql8a"
export DB_VERSION="MYSQL_8_0"
export DB_CPU="2"
export DB_MEMORY="4GB"
export DS_USER="datastream"
# Random password; kept in the environment because it is needed again
# later for the Datastream MySQL connection profile.
export DS_USER_PASSWORD="$(openssl rand -base64 32)"

# Create a private-IP-only instance with binary logging enabled
# (the binlog is what Datastream reads for CDC).
# fix: --enable-google-private-path was passed twice in the original.
gcloud beta sql instances create "$DB_INSTANCE_NAME" \
    --cpu="$DB_CPU" \
    --memory="$DB_MEMORY" \
    --region="$REGION" \
    --no-assign-ip \
    --enable-google-private-path \
    --network="$NETWORK_NAME" \
    --database-version="$DB_VERSION" \
    --enable-bin-log \
    --no-deletion-protection \
    --edition=enterprise \
    --database-flags=net_read_timeout=3600,net_write_timeout=3600,wait_timeout=86400

# Lock down the default (root) user [password entered manually at the prompt].
gcloud sql users set-password root \
    --host=% \
    --instance="$DB_INSTANCE_NAME" \
    --prompt-for-password

# Create the user Datastream will connect as.
# fix: quote the password — base64 output can contain characters
# ('+', '/', '=') that should not be exposed to word splitting/globbing.
gcloud sql users create "$DS_USER" \
    --instance="$DB_INSTANCE_NAME" \
    --host='%' \
    --password="$DS_USER_PASSWORD"

# Create the demo database.
gcloud sql databases create employees \
    --instance="$DB_INSTANCE_NAME"
#############################################################
# DATASTREAM NETWORK PROXY (GCE INSTANCE)
#############################################################
export VM_NAME="reverse-proxya"
export VM_IMAGE_NAME="debian-12"
export VM_IMAGE_PROJECT="debian-cloud"

# *** IMPORTANT ***
# There are two options for creating the Datastream proxy VM.
# Choose only ONE of the two options below.
#
# OPTION 1:
#   Use a startup script that installs iptables port-forwarding rules.
#   Simple setup for a single database. For multiple databases you must
#   extend the startup script with one IP address and one unique port
#   per database.
#
# OPTION 2:
#   Use cloud-sql-proxy to reach one or more Cloud SQL instances
#   (one unique local port per instance). Requires a service account
#   and a credential file; for production, the cloud-sql-proxy command
#   should be run as a startup service.

#############################################################
# OPTION 1:
#############################################################
export DB_PORT=3306                         # MySQL default port
export STARTUP_SCRIPT="reverse_proxy_sp.sh"

# Private IP address of the Cloud SQL instance.
export DB_ADDR="$(gcloud sql instances describe "$DB_INSTANCE_NAME" \
    --format='value(ipAddresses.ipAddress)')"

# Generate the startup script. DB_ADDR/DB_PORT are expanded now; the
# escaped \$ variables are resolved on the VM at boot time.
# fix: 'head -n1' selects a single interface — the original grep could
# return several names on a multi-NIC VM.
cat << EOF > "$STARTUP_SCRIPT"
#! /bin/bash
export DB_ADDR=$DB_ADDR
export DB_PORT=$DB_PORT
export ETH_NAME=\$(ip -o link show | awk -F': ' '{print \$2}' | grep -v lo | head -n1)
export LOCAL_IP_ADDR=\$(ip -4 addr show \$ETH_NAME | grep -Po 'inet \K[\d.]+')
echo 1 > /proc/sys/net/ipv4/ip_forward
iptables -t nat -A PREROUTING -p tcp -m tcp --dport \$DB_PORT -j DNAT \
  --to-destination \$DB_ADDR:\$DB_PORT
iptables -t nat -A POSTROUTING -j SNAT --to-source \$LOCAL_IP_ADDR
EOF
# Create the proxy VM (no external IP; ssh goes through IAP).
gcloud compute instances create "$VM_NAME" \
    --project="$PROJECT_ID" \
    --zone="$GCP_ZONE" \
    --machine-type=e2-micro \
    --network="$NETWORK_NAME" \
    --image-family="$VM_IMAGE_NAME" \
    --image-project="$VM_IMAGE_PROJECT" \
    --no-address \
    --metadata-from-file=startup-script="$STARTUP_SCRIPT"

# Verify the startup script ran without any errors.
# fix: pass --zone explicitly so ssh works without a configured default.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "sudo journalctl -u google-startup-scripts.service"

# Add a DNS entry so the VM can still reach the internet: the iptables
# rules from the startup script break normal resolution, and without
# this you cannot install updates or the MySQL client.
# NOTE: restarting the instance overwrites /etc/resolv.conf again.
# fix: the original nested double quotes terminated the outer string;
# single-quote the remote command instead.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- 'sudo sh -c "echo nameserver 8.8.8.8 > /etc/resolv.conf"'

# Get updates and install netcat plus the MySQL client.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "sudo apt update && sudo apt install -y netcat-traditional default-mysql-client"

# Test that the VM can reach the Cloud SQL instance.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "nc -zv $DB_ADDR $DB_PORT"
#############################################################
# OPTION 2:
# USE CLOUD SQL PROXY — lets a single VM front multiple databases.
# This requires a service-account credential file and one unique
# local port per Cloud SQL instance.
#############################################################
export DB_PORT=6000     # local port the proxy will listen on
export CONN_NAME="$(gcloud sql instances describe "$DB_INSTANCE_NAME" \
    --format='value(connectionName)')"
# With the proxy in front, clients connect to localhost.
export DB_ADDR="127.0.0.1"

export SA_CSQL_PROXY_NAME="sa-csql-proxy"

# Create the service account the proxy authenticates as.
gcloud iam service-accounts create "$SA_CSQL_PROXY_NAME" \
    --description="Cloud SQL proxy SA" \
    --display-name="$SA_CSQL_PROXY_NAME"

# Look up the generated service-account email.
export SA_CSQL_PROXY_EMAIL="$(gcloud iam service-accounts list \
    --filter="NAME:$SA_CSQL_PROXY_NAME" --format='value(EMAIL)')"

# Grant the service account the Cloud SQL Client role.
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
    --member="serviceAccount:$SA_CSQL_PROXY_EMAIL" \
    --role='roles/cloudsql.client'

# Create the service-account credential file (written to the current dir).
gcloud iam service-accounts keys create "$SA_CSQL_PROXY_NAME.json" \
    --iam-account="$SA_CSQL_PROXY_EMAIL"
# Create the VM — change the machine type as desired.
gcloud compute instances create "$VM_NAME" \
    --project="$PROJECT_ID" \
    --zone="$GCP_ZONE" \
    --machine-type=e2-micro \
    --network="$NETWORK_NAME" \
    --image-family="$VM_IMAGE_NAME" \
    --image-project="$VM_IMAGE_PROJECT" \
    --no-address

# Install netcat plus the MySQL client.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "sudo apt update && sudo apt install -y netcat-traditional default-mysql-client"

# Test connectivity between the VM and the Cloud SQL instance.
# NOTE(review): DB_ADDR is 127.0.0.1 at this point, so this probe only
# succeeds once something listens locally on 3306 — it was probably
# meant to target the instance's private IP; verify before relying on it.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "nc -zv $DB_ADDR 3306"

# Download cloud-sql-proxy (v2 binary).
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "curl -o cloud-sql-proxy https://storage.googleapis.com/cloud-sql-connectors/cloud-sql-proxy/v2.12.0/cloud-sql-proxy.linux.amd64 && chmod +x cloud-sql-proxy"

# Copy the credential file to the VM.
gcloud compute scp "$SA_CSQL_PROXY_NAME.json" "$VM_NAME":~ --zone="$GCP_ZONE"

# Run cloud-sql-proxy.
# For multiple databases add more connection strings like this:
#   ./cloud-sql-proxy \
#     'my-project:us-central1:mysql8?port=6000' \
#     'my-project:us-central1:mysql8a?port=6001' \
#     --credentials-file=sa-csql-proxy.json \
#     --private-ip \
#     --address 0.0.0.0

# Background the proxy command.
# DO NOT DO THIS IN PRODUCTION!
nohup gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "nohup ./cloud-sql-proxy \
      '$CONN_NAME?port=$DB_PORT' \
      --credentials-file=$SA_CSQL_PROXY_NAME.json \
      --private-ip \
      --address 0.0.0.0 " &
# Resume the suspended ssh job in the background.
bg

# Test connectivity through cloud-sql-proxy.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "mysql -h $DB_ADDR -P $DB_PORT -u root -p << EOF
show databases;
EOF"
# TODO: run cloud-sql-proxy from the startup script or as a systemd service.
#############################################################
# Once the VM is up we can connect to the database and grant the
# datastream user the replication privileges CDC requires, then
# echo the grants back as a sanity check.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "mysql -h $DB_ADDR -P $DB_PORT -u root -p << EOF
GRANT REPLICATION SLAVE, SELECT, REPLICATION CLIENT ON *.* TO '$DS_USER'@'%';
FLUSH PRIVILEGES;
select host, user, select_priv, Repl_slave_priv , Repl_client_priv from mysql.user
where user = '$DS_USER';
EOF"
#############################################################
# DATASTREAM
#############################################################
export DS_PRIVATE_CONN_NAME="demo-privateconnection"
export DS_CONN_PROFILE_MYSQL="ds-mysql-profile"
export DS_CONN_PROFILE_BQ="ds-bq-profile"
export DS_SOURCE_CONFIG_FILE="source_config.json"
export DS_DEST_CONFIG_FILE="dest_config.json"
export DS_STREAM_NAME="employees-stream"
export BQ_DATA_FRESHNESS="300s"
export DS_PRIVATE_CONN_RANGE="10.15.0.0/29"   # change as needed

# Private IP of the proxy VM — Datastream connects to it, not the DB.
# fix: pass --zone so describe works without a configured default zone.
export VM_PRIVATE_IP="$(gcloud compute instances describe "$VM_NAME" \
    --zone="$GCP_ZONE" --format='value(networkInterfaces.networkIP)')"

# Allow ingress from the Datastream private-connectivity range.
gcloud compute firewall-rules create allow-private-connectivity \
    --direction=INGRESS \
    --priority=1000 \
    --network="$NETWORK_NAME" \
    --project="$PROJECT_ID" \
    --action=ALLOW \
    --rules=TCP \
    --source-ranges="$DS_PRIVATE_CONN_RANGE"

# Create the Datastream private connection — this can take a few
# minutes; wait for it to complete before moving forward.
gcloud datastream private-connections create "$DS_PRIVATE_CONN_NAME" \
    --location="$REGION" \
    --display-name="$DS_PRIVATE_CONN_NAME" \
    --vpc="$NETWORK_NAME" \
    --subnet="$DS_PRIVATE_CONN_RANGE"

# Check the status: the STATE column should show "CREATED" before proceeding.
gcloud datastream private-connections list --location="$REGION"

# MySQL connection profile pointing at the proxy VM's private IP.
gcloud datastream connection-profiles create "$DS_CONN_PROFILE_MYSQL" \
    --location="$REGION" \
    --type=mysql \
    --mysql-password="$DS_USER_PASSWORD" \
    --mysql-username="$DS_USER" \
    --display-name="$DS_CONN_PROFILE_MYSQL" \
    --mysql-hostname="$VM_PRIVATE_IP" \
    --mysql-port="$DB_PORT" \
    --private-connection="$DS_PRIVATE_CONN_NAME"

# BigQuery connection profile.
gcloud datastream connection-profiles create "$DS_CONN_PROFILE_BQ" \
    --location="$REGION" \
    --type=bigquery \
    --display-name="$DS_CONN_PROFILE_BQ"

# Verify both connection profiles were created.
gcloud datastream connection-profiles list --location="$REGION"
## Source configuration: replicate only the "employees" database.
cat > "$DS_SOURCE_CONFIG_FILE" << EOF
{
"excludeObjects": {},
"includeObjects": {
"mysqlDatabases": [
{
"database":"employees"
}
]
}
}
EOF

## Destination configuration: source-hierarchy datasets in $REGION,
## merge write mode, with the configured data-freshness window.
cat > "$DS_DEST_CONFIG_FILE" << EOF
{
"sourceHierarchyDatasets": {
"datasetTemplate": {
"location": "$REGION"
}
},
"merge": {},
"dataFreshness": "$BQ_DATA_FRESHNESS"
}
EOF
# Create the stream and backfill everything that already exists.
gcloud datastream streams create "$DS_STREAM_NAME" \
    --location="$REGION" \
    --display-name="$DS_STREAM_NAME" \
    --source="$DS_CONN_PROFILE_MYSQL" \
    --mysql-source-config="$DS_SOURCE_CONFIG_FILE" \
    --destination="$DS_CONN_PROFILE_BQ" \
    --bigquery-destination-config="$DS_DEST_CONFIG_FILE" \
    --backfill-all

# Start the stream by PATCHing its state through the REST API.
export TOKEN="$(gcloud auth print-access-token)"
export UPDATE='{"state":"RUNNING"}'
curl -X PATCH \
    "https://datastream.googleapis.com/v1/projects/$PROJECT_ID/locations/$REGION/streams/$DS_STREAM_NAME?updateMask=state" \
    -d "$UPDATE" \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json"

# Check the status of the stream.
curl \
    "https://datastream.googleapis.com/v1/projects/$PROJECT_ID/locations/$REGION/streams/$DS_STREAM_NAME" \
    -H "Authorization: Bearer $TOKEN" \
    -H "Content-Type: application/json"
#############################################################
# TEST DATA LOADING
#############################################################
# Clone the sample "employees" dataset onto the proxy VM.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "sudo apt install -y git && git clone https://github.com/datacharmer/test_db.git"
# Load it into the Cloud SQL database.
gcloud compute ssh "$VM_NAME" --zone="$GCP_ZONE" \
    -- "cd test_db && mysql -h $DB_ADDR -P $DB_PORT -u root -p < employees.sql"
# Verify the objects were added and backfill has started.
gcloud datastream objects list --stream="$DS_STREAM_NAME" --location="$REGION"
#############################################################
# CLEANUP
#############################################################
# Delete the stream.
gcloud datastream streams delete "$DS_STREAM_NAME" --location="$REGION"

# Remove the generated config files.
# fix: use 'rm -f --' — only one of the two options is ever executed,
# so some of these files may not exist; a bare rm would error out.
rm -f -- "$DS_SOURCE_CONFIG_FILE" "$DS_DEST_CONFIG_FILE"
rm -f -- "$STARTUP_SCRIPT"
rm -f -- "$SA_CSQL_PROXY_NAME.json"

# Delete the connection profiles (fix: typo "delect" in the original).
gcloud datastream connection-profiles delete "$DS_CONN_PROFILE_MYSQL" --location="$REGION"
gcloud datastream connection-profiles delete "$DS_CONN_PROFILE_BQ" --location="$REGION"

# Delete the private connection (might need to run this twice).
gcloud datastream private-connections delete "$DS_PRIVATE_CONN_NAME" --location="$REGION" --force

# Delete the Cloud SQL instance.
gcloud sql instances delete "$DB_INSTANCE_NAME" --quiet

# Delete the proxy VM.
gcloud compute instances delete "$VM_NAME" --zone="$GCP_ZONE"

# Delete the replicated dataset from BigQuery.
bq rm -r -d employees

# Delete the service account.
gcloud iam service-accounts delete "$SA_CSQL_PROXY_EMAIL"

# Delete the firewall rule.
gcloud compute firewall-rules delete allow-private-connectivity
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Datastream CDC with MySQL and BigQuery
Since a few customers have asked similar questions about connecting Datastream to Cloud SQL with a private IP address, I decided to write this gist to help others. The above steps can be used to create a quick Demo or POC showing how Datastream can connect to Cloud SQL with a private IP address.
Results
Here are a few screenshots:
Datastream Private Connection

Stream Running

Command line:
Console:
Datasets in BigQuery
