Marcell Biemann (mbiemann)

View GitHub Profile

Glue PySpark Job
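A minimal Glue PySpark skeleton for context; this is a hedged sketch assuming the standard awsglue boilerplate, not this gist's actual code, and the JOB_NAME argument is the usual Glue convention:

import sys
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

# Resolve the job name passed in by Glue and initialize the job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# ... read, transform, and write data here (placeholder) ...
job.commit()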

Connect to AWS using SAML2AWS

saml2aws login --skip-prompt --disable-keychain

role=arn:aws:iam::xxx:role/xxx
credentials=$(aws sts assume-role --role-arn "$role" --role-session-name tmp --profile saml)
# Parse the temporary access key out of the assume-role JSON response
export AWS_ACCESS_KEY_ID=$(echo "$credentials" | grep -o '"AccessKeyId": "[^"]*' | cut -d '"' -f 4)
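# Assumed continuation: matching exports for the secret key and session token
export AWS_SECRET_ACCESS_KEY=$(echo "$credentials" | grep -o '"SecretAccessKey": "[^"]*' | cut -d '"' -f 4)
export AWS_SESSION_TOKEN=$(echo "$credentials" | grep -o '"SessionToken": "[^"]*' | cut -d '"' -f 4)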
@mbiemann
mbiemann / git_copy_between_branches.sh
Last active February 15, 2024 09:57
Git: copy a file from one branch to another
# Switch to the branch that should receive the file
git checkout <target_branch>
# List the files that differ between the two branches
git diff <source_branch> --name-only | cat
# Copy the chosen file from the source branch into the working tree
git checkout <source_branch> -- "<file_name>"
@mbiemann
mbiemann / aws_s3_list_objects_bulletproof.py
Last active April 27, 2023 10:31
List all S3 objects, saving partial results so an interrupted run can resume instead of restarting from the beginning
# Target bucket and prefix, plus local files that persist progress between runs
bucket = 'bucket_name'
prefix = 'path/folder/'
file_meta = './local/folder_meta.json'
file_detail = './local/folder_detail.json'
file_partial = './local/folder_partial.json'
# Optional key at which to stop listing early
stop_key = 'path/folder/partition9/filename9999.csv.gz'
data_meta = {}
data_detail = {}
try:
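    # Hedged sketch of a likely continuation (not the gist's exact code):
    # page through list_objects_v2, saving progress after every page so an
    # interrupted run can resume from file_partial instead of starting over.
    import json
    import boto3
    s3 = boto3.client('s3')
    token = None
    while True:
        kwargs = {'Bucket': bucket, 'Prefix': prefix}
        if token:
            kwargs['ContinuationToken'] = token
        page = s3.list_objects_v2(**kwargs)
        for obj in page.get('Contents', []):
            data_detail[obj['Key']] = obj['Size']
        next_token = page.get('NextContinuationToken')
        with open(file_partial, 'w') as f:
            json.dump({'next_token': next_token, 'objects': len(data_detail)}, f)
        if stop_key in data_detail or not page.get('IsTruncated'):
            break
        token = next_token
except KeyboardInterrupt:
    # Partial progress is already on disk; rerun the script to resume
    print(f'Interrupted after {len(data_detail)} objects; partial state in {file_partial}')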
CloudWatch Logs Insights: parse structured log messages and count events by type

filter @message like '[INFO]'
| filter @message not like 'Found credentials in environment'
| fields @timestamp, @message, substr(@message, 0, 6) as msg_level, substr(@message, 7, 24) as msg_timestamp, substr(@message, 32, 33) as msg_uuid, substr(@message, 69) as msg_body
| parse msg_body '"event": "*"' as event_str
| parse msg_body '"event": {*}' as event_json
| parse event_json '"messageType": "*"' as messageType
| stats count(*) as qty by event_str, messageType
| sort qty desc
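A hedged sketch of running this query programmatically with boto3; the log group name and the one-hour window are placeholders, and "query" stands for the Insights query above stored as a string:

import time
import boto3

logs = boto3.client('logs')

# Start the Logs Insights query over the last hour (log group is a placeholder)
start = logs.start_query(
    logGroupName='/aws/lambda/xxx',
    startTime=int(time.time()) - 3600,
    endTime=int(time.time()),
    queryString=query,  # assumption: the query shown above, as a string
)

# Poll until the query finishes, then print each result row
while True:
    resp = logs.get_query_results(queryId=start['queryId'])
    if resp['status'] in ('Complete', 'Failed', 'Cancelled'):
        break
    time.sleep(1)
for row in resp['results']:
    print(row)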
@mbiemann
mbiemann / databricks-kinesis-spark.py
Last active April 24, 2022 20:23
Databricks Kinesis PySpark - shardsPerTask
import boto3

# Check: cluster_workers >= target, where target = stream_shards / shards_per_task
# and shards_per_task = stream_shards / cluster_workers
cluster_workers = int(spark.sparkContext.getConf().get("spark.databricks.clusterUsageTags.clusterWorkers"))
stream_shards = boto3.client("kinesis").describe_stream_summary(StreamName=source_stream)["StreamDescriptionSummary"]["OpenShardCount"]
shards_per_task = int(stream_shards / cluster_workers)
if shards_per_task < 1:
    raise Exception(f"Sizing Error: Cluster Workers can't be {cluster_workers}. It must be at most {stream_shards}.")
target = int(stream_shards / shards_per_task)
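A hedged sketch of where the computed value plugs in, assuming the Databricks Kinesis source's shardsPerTask option; the region is a placeholder:

# Hypothetical wiring: pass the computed shardsPerTask to the Kinesis reader
df = (
    spark.readStream
    .format("kinesis")
    .option("streamName", source_stream)
    .option("region", "eu-west-1")
    .option("shardsPerTask", shards_per_task)
    .load()
)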
@mbiemann
mbiemann / emr_pyspark_redshift_boto3.py
Created March 28, 2022 19:33
EMR PySpark Redshift Data API Boto3
import boto3
import time

redshift = boto3.client("redshift-data")

# Run a SQL statement through the Redshift Data API (cluster details are placeholders)
def redshift_sql(query):
    resp = redshift.execute_statement(
        ClusterIdentifier="xxx",
        Database="xxx",
        DbUser="xxx",
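        Sql=query,  # assumed continuation from here: the lines below are a hedged sketch, not the gist's exact code
    )
    # Poll the Data API until the statement finishes, then return the result set
    while True:
        status = redshift.describe_statement(Id=resp["Id"])["Status"]
        if status in ("FINISHED", "FAILED", "ABORTED"):
            break
        time.sleep(1)
    if status != "FINISHED":
        raise Exception(f"Redshift statement {status}")
    return redshift.get_statement_result(Id=resp["Id"])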
@mbiemann
mbiemann / Makefile
Last active November 15, 2022 12:25
Some Makefile tips and examples
#!make
MAKEFLAGS += --always-make
MAKEFILE_VERSION = 0.0.2 2022-11-15 12:17 PM
# ==============================================================================
.EXPORT_ALL_VARIABLES:
ifneq ($(wildcard ~/.makeconfig),)
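# Assumed continuation of the conditional: include the per-user config when it exists
include ~/.makeconfig
endif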
@mbiemann
mbiemann / python-dev-env.md
Last active February 22, 2022 14:35
Python Development Environment

This tutorial will install:

  1. Homebrew
  2. pyenv
  3. venv

Prerequisites

@mbiemann
mbiemann / colima-docker-apple-silicon.md
Last active January 23, 2026 07:22
Using colima and Docker Engine on Apple Silicon (M1 Chip)

This tutorial uses Homebrew to install colima and Docker.

It was tested on an Apple MacBook Pro (13-inch, M1, 2020) with 8 GB of RAM, running macOS Monterey 12.1 (21C52).

Uninstall any Docker version

Make sure you have fully uninstalled any existing Docker installation. You can check with:
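# Assumed example check: no docker binary should remain on the PATH
which docker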