word and phrase counter in Ruby
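Reads a plain-text subtitle dump (movie_subtitles_en_new.txt), normalizes it, splits it into 5,000-sentence chunk files under split_files/, and writes word and 2-7-word phrase frequency counts to four CSV files. Needs the progress_bar gem (gem install progress_bar); run the script with ruby from the directory containing the input file.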
require "csv"
require "progress_bar"
input_file = "movie_subtitles_en_new.txt"
output_dir = "split_files"
Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
# configuration
MIN_PHRASE_LEN = 2
MAX_PHRASE_LEN = 7
MIN_PHRASE_COUNT = 2
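# phrases of MIN_PHRASE_LEN..MAX_PHRASE_LEN words are counted; phrases seen fewer
# than MIN_PHRASE_COUNT times are dropped from the phrase CSVs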
# sanitize by erasing byte sequences that are not valid UTF-8
sanitized_input_file = File.read(input_file).encode("UTF-8", invalid: :replace, replace: "")
puts "splitting input file into sentences..."
text = sanitized_input_file
  .gsub(/[A-Z].*?:/, '')   # remove speaker labels (uppercase letter up to the next colon)
  .gsub(/--/, '')          # remove double hyphens
  .gsub(/ - /, '')         # remove spaced single hyphens
  .gsub(/\s*'\s*/, "'")    # remove inconsistent spacing around apostrophes
  .gsub(/OK/, 'okay')      # replace "OK" with "okay"
  .squeeze(' ')            # collapse multiple spaces into one
  .strip                   # remove leading/trailing whitespace
  .gsub(/\.{3}/, '')       # remove ellipses (three dots)
  .gsub(/\++\$\++/, '')    # remove "+$+" style artifacts
sentences = text.downcase.gsub(/all right/, "alright").gsub(/[¡¿"]/, "").split(/\n/)
# initialize progress bar for file splitting
bar = ProgressBar.new((sentences.size / 5_000.0).ceil) # one tick per chunk file
puts "creating chunk files..."
sentences.each_slice(5_000).with_index do |chunk, index|
  File.write("#{output_dir}/chunk_#{index + 1}.txt", chunk.join("\n"))
  bar.increment!
end
# files to generate
word_csv = 'word_counts.csv'
word_clean_csv = 'word_counts_no_commas.csv'
phrase_csv = 'phrase_counts.csv'
phrase_clean_csv = 'phrase_counts_no_commas.csv'
# initialize CSVs with headers unless they already exist
{ word_csv => ['word', 'count'], word_clean_csv => ['word', 'count'],
  phrase_csv => ['length', 'phrase', 'count'], phrase_clean_csv => ['length', 'phrase', 'count'] }.each do |file, header|
  CSV.open(file, 'w') { |csv| csv << header } unless File.exist?(file)
end
puts "processing chunks..."
chunk_files = Dir.children(output_dir)
main_bar = ProgressBar.new(chunk_files.size) # one tick per processed chunk
# initialize counters
word_counts = Hash.new(0)
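# nested counters: outer key is the phrase length in words, inner hash maps phrase => count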
phrase_counts = Hash.new { |h,k| h[k] = Hash.new(0) }
phrase_counts_clean = Hash.new { |h,k| h[k] = Hash.new(0) }
chunk_files.each do |filename|
  chunk_path = "#{output_dir}/#{filename}"
  text = File.read(chunk_path)
  # word processing
  words = text.split
  words.each { |word| word_counts[word] += 1 }
  # phrase processing - two versions
  # 1. original version with punctuation
  words_with_punct = text.split
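  # each_cons(n) yields every run of n consecutive words (a sliding window), e.g.
  # %w[the quick brown].each_cons(2).to_a  #=> [["the", "quick"], ["quick", "brown"]]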
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    next if words_with_punct.size < n
    words_with_punct.each_cons(n) do |window|
      phrase = window.join(' ')
      phrase_counts[n][phrase] += 1
    end
  end
  # 2. clean version without punctuation
  clean_words = text.gsub(/[.,!?]/, ' ').split
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    next if clean_words.size < n
    clean_words.each_cons(n) do |window|
      phrase = window.join(' ')
      phrase_counts_clean[n][phrase] += 1
    end
  end
  main_bar.increment!
end
# save word counts
puts "\nSaving word counts..."
CSV.open(word_csv, 'w') do |csv|
csv << ['word', 'count']
word_counts.sort_by { |w,c| -c }.each { |w,c| csv << [w,c] }
end
# save cleaned word counts (original functionality)
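# strips commas from tokens so that e.g. "yes," and "yes" merge into a single entry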
cleaned_words = Hash.new(0)
word_counts.each { |w,c| cleaned_words[w.gsub(/,/,'')] += c }
CSV.open(word_clean_csv, 'w') do |csv|
  csv << ['word', 'count']
  cleaned_words.sort_by { |w,c| -c }.each { |w,c| csv << [w,c] }
end
# save phrase counts (with punctuation)
puts "saving phrase counts (original)..."
CSV.open(phrase_csv, 'w') do |csv|
  csv << ['length', 'phrase', 'count']
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    phrase_counts[n]
      .select { |p,c| c >= MIN_PHRASE_COUNT }
      .sort_by { |p,c| -c }
      .each { |p,c| csv << [n, p, c] }
  end
end
# save cleaned phrase counts (without punctuation)
puts "Saving phrase counts (no commas)..."
CSV.open(phrase_clean_csv, 'w') do |csv|
csv << ['length', 'phrase', 'count']
(MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
phrase_counts_clean[n]
.select { |p,c| c >= MIN_PHRASE_COUNT }
.sort_by { |p,c| -c }
.each { |p,c| csv << [n, p, c] }
end
end
# final report
total_phrases = (MIN_PHRASE_LEN..MAX_PHRASE_LEN).sum { |n| phrase_counts[n].size }
total_phrases_clean = (MIN_PHRASE_LEN..MAX_PHRASE_LEN).sum { |n| phrase_counts_clean[n].size }
puts "\nall done !"
puts "word counts:".ljust(30) + "#{word_counts.size} unique words"
puts "cleaned words:".ljust(30) + "#{cleaned_words.size} merged entries"
puts "phrase counts (original):".ljust(30) + "#{total_phrases} phrases"
puts "phrase counts (no commas):".ljust(30) + "#{total_phrases_clean} phrases"
puts "files created:".ljust(30) + "#{word_csv}, #{word_clean_csv}, #{phrase_csv}, #{phrase_clean_csv}"