Skip to content

Instantly share code, notes, and snippets.

@ogxd
Created March 20, 2025 20:41
Show Gist options
  • Select an option

  • Save ogxd/b22adad991fbb81b302cd7159c00c548 to your computer and use it in GitHub Desktop.

Select an option

Save ogxd/b22adad991fbb81b302cd7159c00c548 to your computer and use it in GitHub Desktop.
Pre-Allocated Collections
#pragma warning disable CA2012
#pragma warning disable CA1816
using System;
using System.Collections;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading;
public class PreAllocatedCollectionsFactory<T> where T : class, ICollection
{
private readonly int _windowSize;
private readonly int _maxCapacity;
private readonly int _minIntervalMs;
private readonly double _factor;
private readonly double _spreadThreshold;
private readonly Func<int, T> _collectionConstructorWithCapacity;
private readonly ConditionalWeakTable<T, Finalizer> _weakTable;
private readonly OptimizedLinkedList<int> _measurements;
private int _sum;
private int _sumOfSquares;
private int _recommendedCapacity;
private long _lastExecutionTime;
/// <summary>
/// Create a new instance of <see cref="PreAllocatedCollectionsFactory{T}"/> to create collections with pre-allocated capacity,
/// based on previous measurements on size of collections created by this factory.
/// </summary>
/// <param name="collectionConstructorWithCapacity">Delegate function to create collection with some computed capacity.</param>
/// <param name="maxCapacity">Maximum capacity this factory may use for creating collections.</param>
/// <param name="windowSize">Windows size of computing sliding-average and sliding-standard deviation.</param>
/// <param name="minIntervalMs">Minimal time interval between two measurements.</param>
/// <param name="spreadThreshold">Threshold to consider the heuristic reliable. When the threshold (meaning that measured counts are too spread out), the factory will use a capacity of 0.</param>
/// <param name="factor">Amplification factor over capacity to use.</param>
public PreAllocatedCollectionsFactory(
Func<int, T> collectionConstructorWithCapacity,
int maxCapacity = 65536,
int windowSize = 1_000,
int minIntervalMs = 100,
double spreadThreshold = 0.5,
double factor = 1.3)
{
_collectionConstructorWithCapacity = collectionConstructorWithCapacity;
_windowSize = windowSize;
_maxCapacity = maxCapacity;
_minIntervalMs = minIntervalMs;
_factor = factor;
_spreadThreshold = spreadThreshold;
_weakTable = new();
_measurements = new(_windowSize + 1);
}
public T New()
{
T collection = _collectionConstructorWithCapacity(_recommendedCapacity);
TryTrack(collection);
return collection;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void TryTrack(T collection)
{
long now = Stopwatch.GetTimestamp();
long lastTime = Interlocked.Read(ref _lastExecutionTime);
if (Stopwatch.GetElapsedTime(lastTime, now).TotalMilliseconds >= _minIntervalMs)
{
if (Interlocked.CompareExchange(ref _lastExecutionTime, now, lastTime) == lastTime)
{
// The ConditionalWeakTable allows us to link of finalize lifetime with the collection lifetime,
// such that the finalize deconstructor is called when no more entries are added to the collection.
// This way we can measure the "workable" capacity of the collection.
_weakTable.Add(collection, new Finalizer(collection, this));
}
}
}
private void Update(int count)
{
// Thanks to the minIntervalMs parameter, we expect this method to not be called too often,
// avoiding contention on the underlying lock.
lock (_measurements)
{
// Compute sliding sum and sliding sum of squares without iterating thanks to the linked list.
// This then allows efficient computation of mean and variance.
_measurements.AddLast(count);
_sum += count;
_sumOfSquares += count * count;
if (_measurements.Count >= _windowSize)
{
_measurements.RemoveFirst(out int removedValue);
_sum -= removedValue;
_sumOfSquares -= removedValue * removedValue;
}
double mean = 1d * _sum / _measurements.Count;
// This is known as the "shortcut formula" for variance. It allows to compute variance in a sliding manner.
double variance = (1d * _sumOfSquares / _measurements.Count) - (mean * mean);
double stddev = Math.Sqrt(variance);
// We want to have an estimation of how spreaded the measurements are.
// For this, we can of the Margin of Error (MoE): https://en.wikipedia.org/wiki/Margin_of_error
// The margin of error is expressed as: MoE = z * (stddev / sqrt(n)) -- in our where z = 1
// We would like something normalized to [0 - 1] so we can use a threshold to decided if values are too spreaded
// for our heuristic to be reliable.
// To do so, we must divide the MoE by the mean of the measurements:
// spread = MoE / mean = (stddev / sqrt(n)) / (sum / n) = stddev * sqrt(n) / sum ✨
double spread = stddev * Math.Sqrt(_measurements.Count) / _sum;
// If the spread is too high, we consider the heuristic unreliable and we set the capacity to 0.
_recommendedCapacity = spread > _spreadThreshold ? 0 : Math.Min(_maxCapacity, (int)(_factor * mean));
}
}
internal class Finalizer(T collection, PreAllocatedCollectionsFactory<T> factory)
{
// The finalizer has a small overhead, but is acceptable here as it is not called too often thanks to the minIntervalMs parameter.
~Finalizer()
{
factory.Update(collection.Count);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment