Created
February 21, 2025 00:50
-
-
Save johnrcui/a1d06daa80904ec13b300441847598b3 to your computer and use it in GitHub Desktop.
Levenshtein distance in MySQL with short circuit logic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| DELIMITER // | |
| -- Compute the Levenshtein distance between two strings within a given max distance | |
| -- If any point it becomes certain max distance threshold will be crossed, the function | |
| -- short circuits and returns immedately with a value of `max_distance + 1` | |
| -- | |
| -- NOTE: This code was generated and optimized using DeepSeek R1 | |
| -- | |
| CREATE FUNCTION MAX_LEVENSHTEIN(s1 VARCHAR(255), s2 VARCHAR(255), max_distance INT) RETURNS INT | |
| BEGIN | |
| DECLARE s1_len, s2_len, i, j, val, cost, current_min, diag, up, left_val INT; | |
| DECLARE prev_row, curr_row TEXT; | |
| SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2); | |
| -- Early return if strings are identical | |
| IF s1 = s2 THEN | |
| RETURN 0; | |
| END IF; | |
| -- Check if the difference in lengths exceeds max_distance | |
| IF ABS(s1_len - s2_len) > max_distance THEN | |
| RETURN max_distance + 1; | |
| END IF; | |
| -- Handle edge cases where either string is empty | |
| IF s1_len = 0 THEN | |
| RETURN IF(s2_len <= max_distance, s2_len, max_distance + 1); | |
| END IF; | |
| IF s2_len = 0 THEN | |
| RETURN IF(s1_len <= max_distance, s1_len, max_distance + 1); | |
| END IF; | |
| -- Initialize the previous row of distances | |
| SET prev_row = ''; | |
| SET j = 0; | |
| WHILE j <= s2_len DO | |
| SET prev_row = CONCAT(prev_row, ',', j); | |
| SET j = j + 1; | |
| END WHILE; | |
| SET prev_row = SUBSTRING(prev_row FROM 2); | |
| -- Iterate over each character in s1 | |
| SET i = 1; | |
| WHILE i <= s1_len DO | |
| SET curr_row = ''; | |
| SET current_min = 2147483647; -- Initialize to a large value | |
| SET j = 0; | |
| WHILE j <= s2_len DO | |
| IF j = 0 THEN | |
| -- First column represents transforming to empty string | |
| SET val = i; | |
| ELSE | |
| -- Calculate cost based on current characters | |
| IF SUBSTRING(s1, i, 1) = SUBSTRING(s2, j, 1) THEN | |
| SET cost = 0; | |
| ELSE | |
| SET cost = 1; | |
| END IF; | |
| -- Retrieve previous row values for diagonal, up, and left | |
| SET diag = CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(prev_row, ',', j), ',', -1) AS UNSIGNED); | |
| IF j > 0 THEN | |
| SET up = CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(prev_row, ',', j + 1), ',', -1) AS UNSIGNED); | |
| ELSE | |
| SET up = i; | |
| END IF; | |
| IF LENGTH(curr_row) > 0 THEN | |
| SET left_val = CAST(SUBSTRING_INDEX(curr_row, ',', -1) AS UNSIGNED); | |
| ELSE | |
| SET left_val = i; | |
| END IF; | |
| -- Compute the current cell value | |
| SET val = LEAST(up + 1, left_val + 1, diag + cost); | |
| END IF; | |
| -- Update current row and track the minimum value | |
| SET curr_row = CONCAT(curr_row, ',', val); | |
| IF val < current_min THEN | |
| SET current_min = val; | |
| END IF; | |
| SET j = j + 1; | |
| END WHILE; | |
| SET curr_row = SUBSTRING(curr_row FROM 2); | |
| -- Short-circuit if current minimum exceeds max_distance | |
| IF current_min > max_distance THEN | |
| RETURN max_distance + 1; | |
| END IF; | |
| -- Prepare for next iteration | |
| SET prev_row = curr_row; | |
| SET i = i + 1; | |
| END WHILE; | |
| -- Extract the final Levenshtein distance from the last row | |
| SET val = CAST(SUBSTRING_INDEX(prev_row, ',', -1) AS UNSIGNED); | |
| -- Return the result if within max_distance, else max_distance + 1 | |
| RETURN IF(val <= max_distance, val, max_distance + 1); | |
| END// | |
| DELIMITER ; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment