SequentialDesign · September 19, 2025 20:32
diff --git a/string_distance.rb b/string_distance.rb
 # String distance algorithms implementation
 class StringDistance
  # Calculates the Damerau-Levenshtein distance between two strings
  # This includes insertion, deletion, substitution, and transposition operations
  def self.damerau_levenshtein(str1, str2)
    # Input validation
    str1 = str1.to_s
    str2 = str2.to_s
    
    return str2.length if str1.empty?
    return str1.length if str2.empty?

    # Initialize distance matrix
    rows = str1.length + 1
    cols = str2.length + 1
    matrix = Array.new(rows) { Array.new(cols, 0) }

    # Initialize first row and column
    (0...rows).each { |i| matrix[i][0] = i }
    (0...cols).each { |j| matrix[0][j] = j }

    # Fill the matrix
    str1_chars = str1.chars
    str2_chars = str2.chars

    (1...rows).each do |i|
      (1...cols).each do |j|
        cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1

        matrix[i][j] = [
          matrix[i - 1][j] + 1,     # deletion
          matrix[i][j - 1] + 1,     # insertion
          matrix[i - 1][j - 1] + cost # substitution
        ].min

        # Check for transposition
        if i > 1 && j > 1 && 
           str1_chars[i - 1] == str2_chars[j - 2] && 
           str1_chars[i - 2] == str2_chars[j - 1]
          matrix[i][j] = [matrix[i][j], matrix[i - 2][j - 2] + 1].min
        end
      end
    end

    matrix[rows - 1][cols - 1]
  end

  # Calculates the Levenshtein distance between two strings
  # This includes insertion, deletion, and substitution operations
  def self.levenshtein(str1, str2)
    # Input validation
    str1 = str1.to_s
    str2 = str2.to_s
    
    return str2.length if str1.empty?
    return str1.length if str2.empty?

    rows = str1.length + 1
    cols = str2.length + 1
    matrix = Array.new(rows) { Array.new(cols, 0) }

    # Initialize first row and column
    (0...rows).each { |i| matrix[i][0] = i }
    (0...cols).each { |j| matrix[0][j] = j }

    # Fill the matrix
    str1_chars = str1.chars
    str2_chars = str2.chars

    (1...rows).each do |i|
      (1...cols).each do |j|
        cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1

        matrix[i][j] = [
          matrix[i - 1][j] + 1,     # deletion
          matrix[i][j - 1] + 1,     # insertion
          matrix[i - 1][j - 1] + cost # substitution
        ].min
      end
    end

    matrix[rows - 1][cols - 1]
  end

  # Calculates the Sørensen-Dice coefficient between two strings
  # Based on 2-gram similarity (returns 0.0 to 1.0)
  def self.dice_coefficient(str1, str2)
    # Input validation
    str1 = str1.to_s
    str2 = str2.to_s
    
    return 1.0 if str1 == str2 # Identical strings
    
    str1_bigrams = ngrams(str1, 2)
    str2_bigrams = ngrams(str2, 2)
    
    # Handle edge cases
    return 0.0 if str1_bigrams.empty? || str2_bigrams.empty?

    intersection = (str1_bigrams & str2_bigrams).length
    total = str1_bigrams.length + str2_bigrams.length
    
    total.zero? ? 0.0 : (2.0 * intersection) / total
  end

  # Calculates the Hamming distance between two strings
  # Strings must be of equal length
  def self.hamming_distance(str1, str2)
    # Input validation
    str1 = str1.to_s
    str2 = str2.to_s
    
    raise ArgumentError, "Strings must be of equal length" if str1.length != str2.length

    str1.chars.zip(str2.chars).count { |char1, char2| char1 != char2 }
  end

  # Returns normalized similarity score (0.0 to 1.0) based on Levenshtein distance
  # Uses Levenshtein distance as the base metric for calculating similarity
  def self.similarity(str1, str2)
    str1 = str1.to_s
    str2 = str2.to_s
    
    return 1.0 if str1 == str2
    
    max_len = [str1.length, str2.length].max
    return 0.0 if max_len == 0
    
    distance = levenshtein(str1, str2)
    1.0 - (distance.to_f / max_len)
  end

  private

  # Generate n-grams from a string
  # Returns an array of n-character substrings
  def self.ngrams(string, n)
    string = string.to_s
    return [] if string.empty? || n <= 0
    
    # If n is greater than or equal to string length, return the whole string as one n-gram
    return [string] if n >= string.length

    (0..string.length - n).map { |i| string[i, n] }
  end
 end
diff --git a/usage.rb b/usage.rb
 # RSpec helper for comparing scraped strings with expected content
 # This is more RSpec-idiomatic using expect syntax

 def compare(extracted, control, required_similarity = 0.9)
  # Input validation with descriptive error messages
  raise ArgumentError, "extracted string cannot be nil" if extracted.nil?
  raise ArgumentError, "control string cannot be nil" if control.nil?
  
  # Normalize whitespace in both strings
  extracted = normalize_whitespace(extracted)
  control = normalize_whitespace(control)
  
  # Early return for identical strings (optimization and handles empty strings)
  return if extracted == control
  
  # Calculate similarity using StringDistance class
  similarity = StringDistance.similarity(extracted, control)
  
  # Use RSpec expect syntax with custom failure message
  expect(similarity).to be >= required_similarity, 
    "Expected similarity #{required_similarity}, got #{similarity.round(3)}\n" \
    "Expected: #{control.inspect}\n" \
    "Actual:   #{extracted.inspect}"
 end

 private

 # Normalize whitespace by converting tabs/newlines to spaces and collapsing multiple spaces
 def normalize_whitespace(str)
  str.to_s.gsub(/[\n\r\t]/, ' ').gsub(/\s{2,}/, ' ').strip
 end

 # Example usage in RSpec tests:
 #
 # RSpec.describe "Web Scraper" do
 #   it "extracts product title correctly" do
 #     scraped_title = scraper.extract_title(html)
 #     expected_title = "MacBook Pro 14-inch M3 Chip"
 #     
 #     compare(scraped_title, expected_title, 0.9)
 #   end
 #   
 #   it "handles minor formatting differences" do
 #     scraped_text = "Product\n\nName:\t\tAwesome Widget"
 #     expected_text = "Product Name: Awesome Widget"
 #     
 #     compare(scraped_text, expected_text, 0.85)
 #   end
 # end
	# String distance algorithms implementation
	class StringDistance
	# Calculates the Damerau-Levenshtein distance between two strings
	# This includes insertion, deletion, substitution, and transposition operations
	def self.damerau_levenshtein(str1, str2)
	# Input validation
	str1 = str1.to_s
	str2 = str2.to_s

	return str2.length if str1.empty?
	return str1.length if str2.empty?

	# Initialize distance matrix
	rows = str1.length + 1
	cols = str2.length + 1
	matrix = Array.new(rows) { Array.new(cols, 0) }

	# Initialize first row and column
	(0...rows).each { \|i\| matrix[i][0] = i }
	(0...cols).each { \|j\| matrix[0][j] = j }

	# Fill the matrix
	str1_chars = str1.chars
	str2_chars = str2.chars

	(1...rows).each do \|i\|
	(1...cols).each do \|j\|
	cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1

	matrix[i][j] = [
	matrix[i - 1][j] + 1, # deletion
	matrix[i][j - 1] + 1, # insertion
	matrix[i - 1][j - 1] + cost # substitution
	].min

	# Check for transposition
	if i > 1 && j > 1 &&
	str1_chars[i - 1] == str2_chars[j - 2] &&
	str1_chars[i - 2] == str2_chars[j - 1]
	matrix[i][j] = [matrix[i][j], matrix[i - 2][j - 2] + 1].min
	end
	end
	end

	matrix[rows - 1][cols - 1]
	end

	# Calculates the Levenshtein distance between two strings
	# This includes insertion, deletion, and substitution operations
	def self.levenshtein(str1, str2)
	# Input validation
	str1 = str1.to_s
	str2 = str2.to_s

	return str2.length if str1.empty?
	return str1.length if str2.empty?

	rows = str1.length + 1
	cols = str2.length + 1
	matrix = Array.new(rows) { Array.new(cols, 0) }

	# Initialize first row and column
	(0...rows).each { \|i\| matrix[i][0] = i }
	(0...cols).each { \|j\| matrix[0][j] = j }

	# Fill the matrix
	str1_chars = str1.chars
	str2_chars = str2.chars

	(1...rows).each do \|i\|
	(1...cols).each do \|j\|
	cost = str1_chars[i - 1] == str2_chars[j - 1] ? 0 : 1

	matrix[i][j] = [
	matrix[i - 1][j] + 1, # deletion
	matrix[i][j - 1] + 1, # insertion
	matrix[i - 1][j - 1] + cost # substitution
	].min
	end
	end

	matrix[rows - 1][cols - 1]
	end

	# Calculates the Sørensen-Dice coefficient between two strings
	# Based on 2-gram similarity (returns 0.0 to 1.0)
	def self.dice_coefficient(str1, str2)
	# Input validation
	str1 = str1.to_s
	str2 = str2.to_s

	return 1.0 if str1 == str2 # Identical strings

	str1_bigrams = ngrams(str1, 2)
	str2_bigrams = ngrams(str2, 2)

	# Handle edge cases
	return 0.0 if str1_bigrams.empty? \|\| str2_bigrams.empty?

	intersection = (str1_bigrams & str2_bigrams).length
	total = str1_bigrams.length + str2_bigrams.length

	total.zero? ? 0.0 : (2.0 * intersection) / total
	end

	# Calculates the Hamming distance between two strings
	# Strings must be of equal length
	def self.hamming_distance(str1, str2)
	# Input validation
	str1 = str1.to_s
	str2 = str2.to_s

	raise ArgumentError, "Strings must be of equal length" if str1.length != str2.length

	str1.chars.zip(str2.chars).count { \|char1, char2\| char1 != char2 }
	end

	# Returns normalized similarity score (0.0 to 1.0) based on Levenshtein distance
	# Uses Levenshtein distance as the base metric for calculating similarity
	def self.similarity(str1, str2)
	str1 = str1.to_s
	str2 = str2.to_s

	return 1.0 if str1 == str2

	max_len = [str1.length, str2.length].max
	return 0.0 if max_len == 0

	distance = levenshtein(str1, str2)
	1.0 - (distance.to_f / max_len)
	end

	private

	# Generate n-grams from a string
	# Returns an array of n-character substrings
	def self.ngrams(string, n)
	string = string.to_s
	return [] if string.empty? \|\| n <= 0

	# If n is greater than or equal to string length, return the whole string as one n-gram
	return [string] if n >= string.length

	(0..string.length - n).map { \|i\| string[i, n] }
	end
	end
	# RSpec helper for comparing scraped strings with expected content
	# This is more RSpec-idiomatic using expect syntax

	def compare(extracted, control, required_similarity = 0.9)
	# Input validation with descriptive error messages
	raise ArgumentError, "extracted string cannot be nil" if extracted.nil?
	raise ArgumentError, "control string cannot be nil" if control.nil?

	# Normalize whitespace in both strings
	extracted = normalize_whitespace(extracted)
	control = normalize_whitespace(control)

	# Early return for identical strings (optimization and handles empty strings)
	return if extracted == control

	# Calculate similarity using StringDistance class
	similarity = StringDistance.similarity(extracted, control)

	# Use RSpec expect syntax with custom failure message
	expect(similarity).to be >= required_similarity,
	"Expected similarity #{required_similarity}, got #{similarity.round(3)}\n" \
	"Expected: #{control.inspect}\n" \
	"Actual: #{extracted.inspect}"
	end

	private

	# Normalize whitespace by converting tabs/newlines to spaces and collapsing multiple spaces
	def normalize_whitespace(str)
	str.to_s.gsub(/[\n\r\t]/, ' ').gsub(/\s{2,}/, ' ').strip
	end

	# Example usage in RSpec tests:
	#
	# RSpec.describe "Web Scraper" do
	# it "extracts product title correctly" do
	# scraped_title = scraper.extract_title(html)
	# expected_title = "MacBook Pro 14-inch M3 Chip"
	#
	# compare(scraped_title, expected_title, 0.9)
	# end
	#
	# it "handles minor formatting differences" do
	# scraped_text = "Product\n\nName:\t\tAwesome Widget"
	# expected_text = "Product Name: Awesome Widget"
	#
	# compare(scraped_text, expected_text, 0.85)
	# end
	# end