Last active
October 11, 2025 19:52
-
-
Save apcamargo/b97fdf30aa92ba32383b0b9c702420e2 to your computer and use it in GitHub Desktop.
Automatic cutoff determination for an arbitrary distribution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import Sequence | |
| import math | |
def find_cutoff(values: Sequence[float]) -> float:
    """
    Return the cutoff ("knee") value of a biphasic distribution.

    The cutoff is the "bending point" where the sorted curve transitions from
    slowly growing values to rapidly growing values, located with the maximum
    perpendicular distance method.

    The algorithm:

    1. Sorts the input values in ascending order and treats each value as the
       point ``(rank, value)``, with ranks ``0 .. n - 1``.
    2. Defines the baseline as the straight line through the first point
       ``(0, min)`` and the last point ``(n - 1, max)``.
    3. Computes the perpendicular distance of every point to that baseline.
    4. Returns the value of the point with the largest distance.

    Parameters
    ----------
    values : Sequence[float]
        A sequence of numerical values (e.g., gene expression levels, protein
        abundances). Values can be in any order and may be negative or
        offset from zero.

    Returns
    -------
    float
        The element of ``values`` lying farthest from the baseline. When
        several points are equally far (including the degenerate cases of a
        single value or all-equal values, where every distance is zero), the
        smallest such value is returned.

    Raises
    ------
    ValueError
        If ``values`` is empty.

    Notes
    -----
    This method assumes:

    - The data follows a biphasic distribution (many low values, few high
      values), with a clear inflection (knee) separating low and high values.
    - A small number of variables with high values dominate the biological
      system and determine its major processes and functions.

    The distance is unsigned, so a point above the baseline counts the same
    as a point below it.

    References
    ----------
    .. [1] Suvorov, Alexander. "Simple method for cutoff point identification
       in descriptive high-throughput biological studies." BMC Genomics 23.1
       (2022): 204.
    """
    sorted_values = sorted(values)
    if not sorted_values:
        raise ValueError("find_cutoff() requires at least one value")

    # Baseline direction vector from (0, first_value) to (n - 1, last_value).
    first_value = sorted_values[0]
    run = len(sorted_values) - 1
    rise = sorted_values[-1] - first_value

    def baseline_offset(idx: int) -> float:
        # Perpendicular distance from (idx, value) to the baseline is
        #     |rise * idx - run * (value - first_value)| / hypot(run, rise).
        # The denominator is identical for every point, so it is dropped:
        # the ranking is unchanged and no division (hence no division by
        # zero for flat or single-point input) is ever performed.
        return abs(rise * idx - run * (sorted_values[idx] - first_value))

    # max() returns the first maximal item, so ties resolve to the lowest rank.
    cutoff_index = max(range(len(sorted_values)), key=baseline_offset)
    return sorted_values[cutoff_index]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment