Skip to content

Instantly share code, notes, and snippets.

@Aaronontheweb
Last active August 26, 2025 21:32
Show Gist options
  • Select an option

  • Save Aaronontheweb/3d9862f7b34a7c78bd9e4a4bf3546f32 to your computer and use it in GitHub Desktop.

Select an option

Save Aaronontheweb/3d9862f7b34a7c78bd9e4a4bf3546f32 to your computer and use it in GitHub Desktop.
Akka.NET Cluster Client Health Check Solution - Comprehensive health checks for detecting quarantine and client termination

Cluster Client Health Check Solution

This solution addresses the quarantine detection problem for Akka.NET Cluster Clients using the new Akka.Hosting health check features (v1.5.47-beta1).

Problem Statement

Cluster clients can become quarantined by main cluster nodes, making communication impossible until manually restarted. This solution provides automatic detection of quarantine states.

Solution Components

1. QuarantineDetectorActor

  • Subscribes to ThisActorSystemQuarantinedEvent from the EventStream
  • Maintains quarantine state
  • Responds to health check queries

2. QuarantineHealthCheck

  • Implements IHealthCheck interface
  • Queries the QuarantineDetectorActor via Ask pattern
  • Returns unhealthy status when system is quarantined

3. Extension Methods

  • WithQuarantineHealthCheck(): Easy setup for quarantine detection
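  • WithClusterClientHealthCheck(): Monitors the ClusterClient actor for termination and optionally configures its reconnect timeout
  • WithClusterClientAndQuarantineHealthCheck(): Combines quarantine detection with the ClusterClient liveness check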

Usage

builder.Services.AddAkka("ClusterClientSystem", (configurationBuilder, provider) =>
{
    configurationBuilder
        .WithRemoting("localhost", 0)
        .WithClustering()
        .WithClusterClient(contactPoints)
        // Add quarantine health check with one line
        .WithQuarantineHealthCheck()
        // Additionally monitor ClusterClient liveness (with an optional reconnect timeout)
        .WithClusterClientHealthCheck(reconnectTimeout: TimeSpan.FromMinutes(5));
});

Kubernetes Integration

The sample includes three health check endpoints:

  1. /health - Liveness probe (returns 503 when quarantined)
  2. /health/all - Detailed health status with all checks
  3. /health/ready - Readiness probe

Kubernetes Configuration Example

livenessProbe:
  httpGet:
    path: /health
    port: 80
  initialDelaySeconds: 30
  periodSeconds: 10
  failureThreshold: 3

readinessProbe:
  httpGet:
    path: /health/ready
    port: 80
  initialDelaySeconds: 5
  periodSeconds: 5

When the health check detects quarantine, Kubernetes will:

  1. Mark the pod as unhealthy
  2. Stop routing traffic (readiness fails)
  3. Restart the pod after failureThreshold is reached (liveness fails)

Additional Features

Reconnect Timeout

Configure ClusterClient to terminate after failing to reconnect:

.WithClusterClientHealthCheck(reconnectTimeout: TimeSpan.FromMinutes(5))

This ensures the ClusterClient actor terminates if it can't reconnect within 5 minutes, allowing for detection via DeathWatch.

Testing

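Run the application and access the health check endpoints it maps:

  • /health/all - JSON report of every registered health check
  • /health - liveness probe
  • /health/ready - readiness probe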

Next Steps for Customer

  1. Replace contact points with actual cluster addresses
  2. Deploy to Kubernetes with health check probes configured
  3. Monitor health check endpoints to verify quarantine detection
  4. Consider adding alerts based on health check failures
  5. Test by inducing quarantine conditions in staging environment
using Akka.Actor;
using Akka.Cluster.Tools.Client;
using Akka.Event;
using Akka.Hosting;
using Akka.Remote;
using Microsoft.Extensions.Diagnostics.HealthChecks;
namespace ClusterClientHealthCheck;
/// <summary>
/// Actor that records whether this ActorSystem has received a
/// <see cref="ThisActorSystemQuarantinedEvent"/> and answers
/// <see cref="GetQuarantineStatus"/> queries with a <see cref="QuarantineStatus"/>.
/// </summary>
public class QuarantineDetectorActor : ReceiveActor
{
    // Set to true by the first quarantine event; nothing in this actor resets it.
    private bool _isQuarantined;

    /// <summary>
    /// Subscribes this actor to quarantine events on the EventStream and installs the message handlers.
    /// </summary>
    public QuarantineDetectorActor()
    {
        Context.System.EventStream.Subscribe(Self, typeof(ThisActorSystemQuarantinedEvent));

        Receive<ThisActorSystemQuarantinedEvent>(OnQuarantined);
        Receive<GetQuarantineStatus>(_ => Sender.Tell(new QuarantineStatus(_isQuarantined)));
    }

    /// <summary>
    /// Creates the <see cref="Akka.Actor.Props"/> used to spawn this actor.
    /// </summary>
    public static Props Props() => Akka.Actor.Props.Create<QuarantineDetectorActor>();

    /// <summary>
    /// Marks the system as quarantined and logs which remote address issued the quarantine.
    /// </summary>
    private void OnQuarantined(ThisActorSystemQuarantinedEvent quarantined)
    {
        _isQuarantined = true;
        Context.GetLogger().Warning($"System quarantined by {quarantined.RemoteAddress}");
    }
}
/// <summary>
/// Actor that death-watches a ClusterClient actor and answers
/// <see cref="GetClusterClientStatus"/> queries with a <see cref="ClusterClientStatus"/>.
/// </summary>
public class ClusterClientMonitorActor : ReceiveActor
{
    private readonly IActorRef _clusterClient;

    // Starts out true; set to false once the watched client is reported terminated.
    private bool _isAlive = true;

    /// <summary>
    /// Starts watching <paramref name="clusterClient"/> and installs the message handlers.
    /// </summary>
    /// <param name="clusterClient">Reference to the actor whose termination should be tracked.</param>
    public ClusterClientMonitorActor(IActorRef clusterClient)
    {
        _clusterClient = clusterClient;
        Context.Watch(_clusterClient);

        Receive<Terminated>(OnTerminated);
        Receive<GetClusterClientStatus>(_ => Sender.Tell(new ClusterClientStatus(_isAlive)));
    }

    /// <summary>
    /// Creates the <see cref="Akka.Actor.Props"/> used to spawn a monitor for <paramref name="clusterClient"/>.
    /// </summary>
    public static Props Props(IActorRef clusterClient) =>
        Akka.Actor.Props.Create(() => new ClusterClientMonitorActor(clusterClient));

    /// <summary>
    /// Flags the client as dead when the termination notice is for the watched actor.
    /// </summary>
    private void OnTerminated(Terminated notice)
    {
        // Termination notices for any other actor are consumed without changing state.
        if (!notice.ActorRef.Equals(_clusterClient))
        {
            return;
        }

        _isAlive = false;
        Context.GetLogger().Warning("ClusterClient terminated");
    }
}
/// <summary>
/// Query message asking for the current quarantine state. Use the shared <see cref="Instance"/>.
/// </summary>
public sealed class GetQuarantineStatus
{
    /// <summary>The single shared instance of this message.</summary>
    public static readonly GetQuarantineStatus Instance = new GetQuarantineStatus();

    // Private so that Instance is the only way to obtain this message.
    private GetQuarantineStatus() { }
}
/// <summary>
/// Reply message carrying the quarantine state.
/// </summary>
public sealed class QuarantineStatus
{
    /// <summary>Creates a status reply.</summary>
    /// <param name="isQuarantined">Whether the system is quarantined.</param>
    public QuarantineStatus(bool isQuarantined) => IsQuarantined = isQuarantined;

    /// <summary>The value passed to the constructor.</summary>
    public bool IsQuarantined { get; }
}
/// <summary>
/// Query message asking whether the monitored ClusterClient is alive. Use the shared <see cref="Instance"/>.
/// </summary>
public sealed class GetClusterClientStatus
{
    /// <summary>The single shared instance of this message.</summary>
    public static readonly GetClusterClientStatus Instance = new GetClusterClientStatus();

    // Private so that Instance is the only way to obtain this message.
    private GetClusterClientStatus() { }
}
/// <summary>
/// Reply message carrying the liveness state of the monitored ClusterClient.
/// </summary>
public sealed class ClusterClientStatus
{
    /// <summary>Creates a status reply.</summary>
    /// <param name="isAlive">Whether the ClusterClient is alive.</param>
    public ClusterClientStatus(bool isAlive) => IsAlive = isAlive;

    /// <summary>The value passed to the constructor.</summary>
    public bool IsAlive { get; }
}
/// <summary>
/// Akka.Hosting builder extensions that spawn the quarantine detector and ClusterClient monitor
/// actors and register health checks that query them.
/// </summary>
public static class AkkaHostingExtensions
{
    /// <summary>
    /// Spawns a <see cref="QuarantineDetectorActor"/> and registers the "akka.quarantine" health check,
    /// which reports unhealthy when the detector says the system is quarantined or when the
    /// detector cannot be queried.
    /// </summary>
    /// <param name="builder">The Akka.Hosting configuration builder.</param>
    /// <param name="quarantineDetectorName">Actor name used for the detector.</param>
    /// <returns>The same <paramref name="builder"/>, for chaining.</returns>
    public static AkkaConfigurationBuilder WithQuarantineHealthCheck(
        this AkkaConfigurationBuilder builder,
        string quarantineDetectorName = "quarantine-detector")
    {
        builder.WithActors((system, registry, resolver) =>
        {
            // Create quarantine detector
            var quarantineDetector = system.ActorOf(
                QuarantineDetectorActor.Props(),
                quarantineDetectorName);
            registry.TryRegister<QuarantineDetectorActor>(quarantineDetector);
        });

        // Register health checks using Akka.Hosting pattern
        builder.WithHealthCheck("akka.quarantine", async (system, registry, cancellationToken) =>
        {
            try
            {
                var detector = await registry.GetAsync<QuarantineDetectorActor>(cancellationToken);
                var result = await detector.Ask<QuarantineStatus>(
                    GetQuarantineStatus.Instance,
                    TimeSpan.FromSeconds(5),
                    cancellationToken);

                if (result.IsQuarantined)
                    return HealthCheckResult.Unhealthy(
                        "ActorSystem is quarantined and requires restart",
                        data: new Dictionary<string, object>
                        {
                            ["quarantined"] = true,
                            ["actorSystem"] = system.Name
                        });

                return HealthCheckResult.Healthy("ActorSystem is not quarantined");
            }
            catch (Exception ex)
            {
                // Any failure to reach the detector (timeout, cancellation, lookup error) is reported as unhealthy.
                return HealthCheckResult.Unhealthy(
                    "Failed to check quarantine status",
                    ex);
            }
        });

        return builder;
    }

    /// <summary>
    /// Spawns a <see cref="ClusterClientMonitorActor"/> watching the registered ClusterClient and
    /// registers the "cluster-client.alive" health check. Optionally prepends HOCON that sets
    /// <c>akka.cluster.client.reconnect-timeout</c>.
    /// </summary>
    /// <param name="builder">The Akka.Hosting configuration builder.</param>
    /// <param name="reconnectTimeout">
    /// When set, written to <c>akka.cluster.client.reconnect-timeout</c> as whole milliseconds.
    /// </param>
    /// <param name="clusterClientMonitorName">Actor name used for the monitor.</param>
    /// <returns>The same <paramref name="builder"/>, for chaining.</returns>
    public static AkkaConfigurationBuilder WithClusterClientHealthCheck(
        this AkkaConfigurationBuilder builder,
        TimeSpan? reconnectTimeout = null,
        string clusterClientMonitorName = "cluster-client-monitor")
    {
        // Configure reconnect timeout if specified
        if (reconnectTimeout is { } timeout)
            builder.AddHocon(BuildReconnectTimeoutHocon(timeout), HoconAddMode.Prepend);

        // Register monitoring actors
        builder.WithActors((system, registry) =>
        {
            // Get the registered ClusterClient and create monitor
            var clusterClient = registry.Get<ClusterClient>();
            var clientMonitor = system.ActorOf(
                ClusterClientMonitorActor.Props(clusterClient),
                clusterClientMonitorName);
            registry.TryRegister<ClusterClientMonitorActor>(clientMonitor);
        });

        // Register health checks using Akka.Hosting pattern
        builder.WithHealthCheck("cluster-client.alive", async (system, registry, cancellationToken) =>
        {
            try
            {
                var monitor = await registry.GetAsync<ClusterClientMonitorActor>(cancellationToken);
                var result = await monitor.Ask<ClusterClientStatus>(
                    GetClusterClientStatus.Instance,
                    TimeSpan.FromSeconds(5),
                    cancellationToken);

                if (!result.IsAlive)
                    return HealthCheckResult.Unhealthy(
                        "ClusterClient is terminated and requires restart",
                        data: new Dictionary<string, object>
                        {
                            ["clusterClientAlive"] = false,
                            ["actorSystem"] = system.Name
                        });

                return HealthCheckResult.Healthy("ClusterClient is alive and running");
            }
            catch (Exception ex)
            {
                // Any failure to reach the monitor (timeout, cancellation, lookup error) is reported as unhealthy.
                return HealthCheckResult.Unhealthy(
                    "Failed to check ClusterClient status",
                    ex);
            }
        });

        return builder;
    }

    /// <summary>
    /// Adds ClusterClient with both quarantine and liveness health checks
    /// </summary>
    /// <param name="builder">The Akka.Hosting configuration builder.</param>
    /// <param name="reconnectTimeout">Forwarded to <see cref="WithClusterClientHealthCheck"/>.</param>
    /// <param name="clusterClientMonitorName">Forwarded to <see cref="WithClusterClientHealthCheck"/>.</param>
    /// <param name="quarantineDetectorName">Forwarded to <see cref="WithQuarantineHealthCheck"/>.</param>
    /// <returns>The same <paramref name="builder"/>, for chaining.</returns>
    public static AkkaConfigurationBuilder WithClusterClientAndQuarantineHealthCheck(
        this AkkaConfigurationBuilder builder,
        TimeSpan? reconnectTimeout = null,
        string clusterClientMonitorName = "cluster-client-monitor",
        string quarantineDetectorName = "quarantine-detector")
    {
        // Add quarantine health check
        builder.WithQuarantineHealthCheck(quarantineDetectorName);

        // Add cluster client health check
        builder.WithClusterClientHealthCheck(reconnectTimeout, clusterClientMonitorName);

        return builder;
    }

    /// <summary>
    /// Renders the HOCON fragment that sets the ClusterClient reconnect timeout.
    /// </summary>
    private static string BuildReconnectTimeoutHocon(TimeSpan reconnectTimeout)
    {
        // Emit whole milliseconds with the invariant culture: formatting TotalSeconds (a double)
        // with the current culture can produce a decimal comma or exponent notation, neither of
        // which is a valid HOCON duration.
        var milliseconds = (long)reconnectTimeout.TotalMilliseconds;

        return FormattableString.Invariant($@"
akka.cluster.client {{
reconnect-timeout = {milliseconds}ms
}}
");
    }
}
<!-- Build definition for the sample: an ASP.NET Core (Web SDK) project that references the
     Akka hosting packages and health check packages listed below. -->
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<!-- Targets .NET 8 with nullable reference types and implicit global usings enabled. -->
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
</PropertyGroup>
<ItemGroup>
<!-- Akka.Hosting with new health check features -->
<PackageReference Include="Akka.Hosting" Version="1.5.47-beta1"/>
<PackageReference Include="Akka.Cluster.Hosting" Version="1.5.47-beta1"/>
<PackageReference Include="Akka.Remote.Hosting" Version="1.5.47-beta1"/>
<!-- Cluster Client -->
<!-- NOTE(review): confirm that Akka.Cluster.Tools is published under this same pre-release version. -->
<PackageReference Include="Akka.Cluster.Tools" Version="1.5.47-beta1"/>
<!-- Health checks -->
<PackageReference Include="AspNetCore.HealthChecks.UI.Client" Version="8.0.1"/>
<PackageReference Include="Microsoft.Extensions.Diagnostics.HealthChecks" Version="8.0.11"/>
</ItemGroup>
</Project>
using Microsoft.Extensions.Diagnostics.HealthChecks;
namespace ClusterClientHealthCheck;
/// <summary>
/// Service-collection helpers for adjusting health check registrations.
/// </summary>
public static class HealthCheckExtensions
{
    /// <summary>
    /// Configure tags for Akka health checks after they've been registered
    /// </summary>
    /// <param name="services">The service collection to add the options configuration to.</param>
    /// <param name="healthCheckName">Name of the health check registration whose tags are replaced.</param>
    /// <param name="tags">Tags that replace the registration's existing tags. Enumerated once, immediately.</param>
    /// <returns>The same <paramref name="services"/>, for chaining.</returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static IServiceCollection ConfigureAkkaHealthCheckTags(
        this IServiceCollection services,
        string healthCheckName,
        IEnumerable<string> tags)
    {
        // Fail at the call site rather than later, when the options are first resolved.
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(healthCheckName);
        ArgumentNullException.ThrowIfNull(tags);

        // Snapshot once so a lazy or one-shot sequence is not re-enumerated per matching registration.
        var tagSnapshot = tags.ToArray();

        // This is a workaround since Akka.Hosting doesn't support tags directly
        // We need to configure the tags after the health checks are registered
        services.Configure<HealthCheckServiceOptions>(options =>
        {
            foreach (var registration in options.Registrations)
            {
                if (registration.Name != healthCheckName)
                {
                    continue;
                }

                // Clear existing tags and add new ones
                registration.Tags.Clear();
                registration.Tags.UnionWith(tagSnapshot);
            }
        });

        return services;
    }
}
using System.Text.Json;
using Akka.Actor;
using Akka.Cluster.Hosting;
using Akka.Cluster.Tools.Client;
using Akka.Hosting;
using Akka.Remote.Hosting;
using ClusterClientHealthCheck;
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
using Microsoft.Extensions.Diagnostics.HealthChecks;
// Application entry point: hosts an ActorSystem with a ClusterClient, registers the quarantine and
// ClusterClient liveness health checks, and exposes them over three HTTP endpoints.
var builder = WebApplication.CreateBuilder(args);

// Add health checks service (required for ASP.NET health check endpoints)
builder.Services.AddHealthChecks();

// Configure Akka.NET with cluster client and comprehensive health checks
builder.Services.AddAkka("ClusterClientSystem", (configurationBuilder, provider) =>
{
    configurationBuilder
        .WithRemoting("localhost", 0)
        .WithClustering()
        // Add ClusterClient using strongly-typed method
        .WithClusterClient<ClusterClient>([
            ActorPath.Parse("akka.tcp://ClusterSystem@localhost:2551/system/receptionist"),
            ActorPath.Parse("akka.tcp://ClusterSystem@localhost:2552/system/receptionist")
        ])
        // Add both health checks (tags will be configured separately)
        .WithClusterClientAndQuarantineHealthCheck(
            TimeSpan.FromSeconds(30) // ClusterClient will terminate after 30s of failed reconnects
        );
});

// Configure custom tags for the health checks
builder.Services.ConfigureAkkaHealthCheckTags("akka.quarantine",
    new[] { "akka", "quarantine", "critical", "k8s" });
builder.Services.ConfigureAkkaHealthCheckTags("cluster-client.alive",
    new[] { "akka", "cluster-client", "liveness", "k8s" });

var app = builder.Build();

// Map health check endpoints
// /health/all: every registered check, rendered as a JSON report
app.MapHealthChecks("/health/all", new HealthCheckOptions
{
    ResponseWriter = async (context, report) =>
    {
        context.Response.ContentType = "application/json";
        var json = JsonSerializer.Serialize(new
        {
            status = report.Status.ToString(),
            checks = report.Entries.Select(x => new
            {
                name = x.Key,
                status = x.Value.Status.ToString(),
                description = x.Value.Description,
                exception = x.Value.Exception?.Message,
                data = x.Value.Data
            })
        });
        await context.Response.WriteAsync(json);
    }
});

// Map liveness probe for Kubernetes
// Every check that is not tagged "ready" takes part in the liveness probe
app.MapHealthChecks("/health", new HealthCheckOptions
{
    Predicate = check => !check.Tags.Contains("ready"),
    ResultStatusCodes =
    {
        [HealthStatus.Healthy] = StatusCodes.Status200OK,
        [HealthStatus.Degraded] = StatusCodes.Status200OK,
        [HealthStatus.Unhealthy] = StatusCodes.Status503ServiceUnavailable
    }
});

// Map readiness probe for Kubernetes
// Use k8s tag to select Kubernetes-specific health checks.
// Filtering on "ready" matched no registered check (neither is tagged "ready"), so the
// endpoint evaluated nothing and always reported healthy.
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("k8s"),
    ResultStatusCodes =
    {
        [HealthStatus.Healthy] = StatusCodes.Status200OK,
        [HealthStatus.Degraded] = StatusCodes.Status503ServiceUnavailable,
        [HealthStatus.Unhealthy] = StatusCodes.Status503ServiceUnavailable
    }
});

app.MapGet("/", () => "Cluster Client with Comprehensive Health Checks is running!");

app.Run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment