-
-
Save SequentialDesign/5140084528d8283dcebc94eec02e4b40 to your computer and use it in GitHub Desktop.
clean string_distance.rb with proper usage example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # String distance algorithms implementation | |
| class StringDistance | |
| # Calculates the Damerau-Levenshtein distance between two strings | |
| # This includes insertion, deletion, substitution, and transposition operations | |
| def self.damerau_levenshtein(str1, str2) | |
| # Input validation | |
| str1 = str1.to_s | |
| str2 = str2.to_s | |
| return str2.length if str1.empty? | |
| return str1.length if str2.empty? | |
| # Initialize distance matrix | |
| rows = str1.length + 1 | |
| cols = str2.length + 1 | |
| matrix = Array.new(rows) { Array.new(cols, 0) } | |
| # Initialize first row and column | |
| (0...rows).each { |i| matrix[i][0] = i } | |
| (0...cols).each { |j| matrix[0][j] = j } | |
| # Fill the matrix | |
| str1_chars = str1.chars | |
| str2_chars = str2.chars | |
| (1...rows).each do |i| | |
| (1...cols).each do |j| | |
| cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1 | |
| matrix[i][j] = [ | |
| matrix[i - 1][j] + 1, # deletion | |
| matrix[i][j - 1] + 1, # insertion | |
| matrix[i - 1][j - 1] + cost # substitution | |
| ].min | |
| # Check for transposition | |
| if i > 1 && j > 1 && | |
| str1_chars[i - 1] == str2_chars[j - 2] && | |
| str1_chars[i - 2] == str2_chars[j - 1] | |
| matrix[i][j] = [matrix[i][j], matrix[i - 2][j - 2] + 1].min | |
| end | |
| end | |
| end | |
| matrix[rows - 1][cols - 1] | |
| end | |
| # Calculates the Levenshtein distance between two strings | |
| # This includes insertion, deletion, and substitution operations | |
| def self.levenshtein(str1, str2) | |
| # Input validation | |
| str1 = str1.to_s | |
| str2 = str2.to_s | |
| return str2.length if str1.empty? | |
| return str1.length if str2.empty? | |
| rows = str1.length + 1 | |
| cols = str2.length + 1 | |
| matrix = Array.new(rows) { Array.new(cols, 0) } | |
| # Initialize first row and column | |
| (0...rows).each { |i| matrix[i][0] = i } | |
| (0...cols).each { |j| matrix[0][j] = j } | |
| # Fill the matrix | |
| str1_chars = str1.chars | |
| str2_chars = str2.chars | |
| (1...rows).each do |i| | |
| (1...cols).each do |j| | |
| cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1 | |
| matrix[i][j] = [ | |
| matrix[i - 1][j] + 1, # deletion | |
| matrix[i][j - 1] + 1, # insertion | |
| matrix[i - 1][j - 1] + cost # substitution | |
| ].min | |
| end | |
| end | |
| matrix[rows - 1][cols - 1] | |
| end | |
| # Calculates the Sørensen-Dice coefficient between two strings | |
| # Based on 2-gram similarity (returns 0.0 to 1.0) | |
| def self.dice_coefficient(str1, str2) | |
| # Input validation | |
| str1 = str1.to_s | |
| str2 = str2.to_s | |
| return 1.0 if str1 == str2 # Identical strings | |
| str1_bigrams = ngrams(str1, 2) | |
| str2_bigrams = ngrams(str2, 2) | |
| # Handle edge cases | |
| return 0.0 if str1_bigrams.empty? || str2_bigrams.empty? | |
| intersection = (str1_bigrams & str2_bigrams).length | |
| total = str1_bigrams.length + str2_bigrams.length | |
| total.zero? ? 0.0 : (2.0 * intersection) / total | |
| end | |
| # Calculates the Hamming distance between two strings | |
| # Strings must be of equal length | |
| def self.hamming_distance(str1, str2) | |
| # Input validation | |
| str1 = str1.to_s | |
| str2 = str2.to_s | |
| raise ArgumentError, "Strings must be of equal length" if str1.length != str2.length | |
| str1.chars.zip(str2.chars).count { |char1, char2| char1 != char2 } | |
| end | |
| # Returns normalized similarity score (0.0 to 1.0) based on Levenshtein distance | |
| # Uses Levenshtein distance as the base metric for calculating similarity | |
| def self.similarity(str1, str2) | |
| str1 = str1.to_s | |
| str2 = str2.to_s | |
| return 1.0 if str1 == str2 | |
| max_len = [str1.length, str2.length].max | |
| return 0.0 if max_len == 0 | |
| distance = levenshtein(str1, str2) | |
| 1.0 - (distance.to_f / max_len) | |
| end | |
| private | |
| # Generate n-grams from a string | |
| # Returns an array of n-character substrings | |
| def self.ngrams(string, n) | |
| string = string.to_s | |
| return [] if string.empty? || n <= 0 | |
| # If n is greater than or equal to string length, return the whole string as one n-gram | |
| return [string] if n >= string.length | |
| (0..string.length - n).map { |i| string[i, n] } | |
| end | |
| end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # RSpec helper for comparing scraped strings with expected content | |
| # This is more RSpec-idiomatic using expect syntax | |
| def compare(extracted, control, required_similarity = 0.9) | |
| # Input validation with descriptive error messages | |
| raise ArgumentError, "extracted string cannot be nil" if extracted.nil? | |
| raise ArgumentError, "control string cannot be nil" if control.nil? | |
| # Normalize whitespace in both strings | |
| extracted = normalize_whitespace(extracted) | |
| control = normalize_whitespace(control) | |
| # Early return for identical strings (optimization and handles empty strings) | |
| return if extracted == control | |
| # Calculate similarity using StringDistance class | |
| similarity = StringDistance.similarity(extracted, control) | |
| # Use RSpec expect syntax with custom failure message | |
| expect(similarity).to be >= required_similarity, | |
| "Expected similarity #{required_similarity}, got #{similarity.round(3)}\n" \ | |
| "Expected: #{control.inspect}\n" \ | |
| "Actual: #{extracted.inspect}" | |
| end | |
| private | |
| # Normalize whitespace by converting tabs/newlines to spaces and collapsing multiple spaces | |
| def normalize_whitespace(str) | |
| str.to_s.gsub(/[\n\r\t]/, ' ').gsub(/\s{2,}/, ' ').strip | |
| end | |
| # Example usage in RSpec tests: | |
| # | |
| # RSpec.describe "Web Scraper" do | |
| # it "extracts product title correctly" do | |
| # scraped_title = scraper.extract_title(html) | |
| # expected_title = "MacBook Pro 14-inch M3 Chip" | |
| # | |
| # compare(scraped_title, expected_title, 0.9) | |
| # end | |
| # | |
| # it "handles minor formatting differences" do | |
| # scraped_text = "Product\n\nName:\t\tAwesome Widget" | |
| # expected_text = "Product Name: Awesome Widget" | |
| # | |
| # compare(scraped_text, expected_text, 0.85) | |
| # end | |
| # end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment