Created
March 20, 2025 20:41
-
-
Save ogxd/b22adad991fbb81b302cd7159c00c548 to your computer and use it in GitHub Desktop.
Pre-Allocated Collections
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #pragma warning disable CA2012 | |
| #pragma warning disable CA1816 | |
| using System; | |
| using System.Collections; | |
| using System.Diagnostics; | |
| using System.Runtime.CompilerServices; | |
| using System.Threading; | |
| public class PreAllocatedCollectionsFactory<T> where T : class, ICollection | |
| { | |
| private readonly int _windowSize; | |
| private readonly int _maxCapacity; | |
| private readonly int _minIntervalMs; | |
| private readonly double _factor; | |
| private readonly double _spreadThreshold; | |
| private readonly Func<int, T> _collectionConstructorWithCapacity; | |
| private readonly ConditionalWeakTable<T, Finalizer> _weakTable; | |
| private readonly OptimizedLinkedList<int> _measurements; | |
| private int _sum; | |
| private int _sumOfSquares; | |
| private int _recommendedCapacity; | |
| private long _lastExecutionTime; | |
| /// <summary> | |
| /// Create a new instance of <see cref="PreAllocatedCollectionsFactory{T}"/> to create collections with pre-allocated capacity, | |
| /// based on previous measurements on size of collections created by this factory. | |
| /// </summary> | |
| /// <param name="collectionConstructorWithCapacity">Delegate function to create collection with some computed capacity.</param> | |
| /// <param name="maxCapacity">Maximum capacity this factory may use for creating collections.</param> | |
| /// <param name="windowSize">Windows size of computing sliding-average and sliding-standard deviation.</param> | |
| /// <param name="minIntervalMs">Minimal time interval between two measurements.</param> | |
| /// <param name="spreadThreshold">Threshold to consider the heuristic reliable. When the threshold (meaning that measured counts are too spread out), the factory will use a capacity of 0.</param> | |
| /// <param name="factor">Amplification factor over capacity to use.</param> | |
| public PreAllocatedCollectionsFactory( | |
| Func<int, T> collectionConstructorWithCapacity, | |
| int maxCapacity = 65536, | |
| int windowSize = 1_000, | |
| int minIntervalMs = 100, | |
| double spreadThreshold = 0.5, | |
| double factor = 1.3) | |
| { | |
| _collectionConstructorWithCapacity = collectionConstructorWithCapacity; | |
| _windowSize = windowSize; | |
| _maxCapacity = maxCapacity; | |
| _minIntervalMs = minIntervalMs; | |
| _factor = factor; | |
| _spreadThreshold = spreadThreshold; | |
| _weakTable = new(); | |
| _measurements = new(_windowSize + 1); | |
| } | |
| public T New() | |
| { | |
| T collection = _collectionConstructorWithCapacity(_recommendedCapacity); | |
| TryTrack(collection); | |
| return collection; | |
| } | |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | |
| private void TryTrack(T collection) | |
| { | |
| long now = Stopwatch.GetTimestamp(); | |
| long lastTime = Interlocked.Read(ref _lastExecutionTime); | |
| if (Stopwatch.GetElapsedTime(lastTime, now).TotalMilliseconds >= _minIntervalMs) | |
| { | |
| if (Interlocked.CompareExchange(ref _lastExecutionTime, now, lastTime) == lastTime) | |
| { | |
| // The ConditionalWeakTable allows us to link of finalize lifetime with the collection lifetime, | |
| // such that the finalize deconstructor is called when no more entries are added to the collection. | |
| // This way we can measure the "workable" capacity of the collection. | |
| _weakTable.Add(collection, new Finalizer(collection, this)); | |
| } | |
| } | |
| } | |
| private void Update(int count) | |
| { | |
| // Thanks to the minIntervalMs parameter, we expect this method to not be called too often, | |
| // avoiding contention on the underlying lock. | |
| lock (_measurements) | |
| { | |
| // Compute sliding sum and sliding sum of squares without iterating thanks to the linked list. | |
| // This then allows efficient computation of mean and variance. | |
| _measurements.AddLast(count); | |
| _sum += count; | |
| _sumOfSquares += count * count; | |
| if (_measurements.Count >= _windowSize) | |
| { | |
| _measurements.RemoveFirst(out int removedValue); | |
| _sum -= removedValue; | |
| _sumOfSquares -= removedValue * removedValue; | |
| } | |
| double mean = 1d * _sum / _measurements.Count; | |
| // This is known as the "shortcut formula" for variance. It allows to compute variance in a sliding manner. | |
| double variance = (1d * _sumOfSquares / _measurements.Count) - (mean * mean); | |
| double stddev = Math.Sqrt(variance); | |
| // We want to have an estimation of how spreaded the measurements are. | |
| // For this, we can of the Margin of Error (MoE): https://en.wikipedia.org/wiki/Margin_of_error | |
| // The margin of error is expressed as: MoE = z * (stddev / sqrt(n)) -- in our where z = 1 | |
| // We would like something normalized to [0 - 1] so we can use a threshold to decided if values are too spreaded | |
| // for our heuristic to be reliable. | |
| // To do so, we must divide the MoE by the mean of the measurements: | |
| // spread = MoE / mean = (stddev / sqrt(n)) / (sum / n) = stddev * sqrt(n) / sum ✨ | |
| double spread = stddev * Math.Sqrt(_measurements.Count) / _sum; | |
| // If the spread is too high, we consider the heuristic unreliable and we set the capacity to 0. | |
| _recommendedCapacity = spread > _spreadThreshold ? 0 : Math.Min(_maxCapacity, (int)(_factor * mean)); | |
| } | |
| } | |
| internal class Finalizer(T collection, PreAllocatedCollectionsFactory<T> factory) | |
| { | |
| // The finalizer has a small overhead, but is acceptable here as it is not called too often thanks to the minIntervalMs parameter. | |
| ~Finalizer() | |
| { | |
| factory.Update(collection.Count); | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment