word and phrase counter in Ruby
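Reads a plain-text subtitle dump (movie_subtitles_en_new.txt), normalizes it, splits it into 5,000-sentence chunk files under split_files/, and writes word and 2-7-word phrase frequency counts to four CSV files. Needs the progress_bar gem (gem install progress_bar); run the script with ruby from the directory containing the input file.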
require "csv"
require "progress_bar"
input_file = "movie_subtitles_en_new.txt"
output_dir = "split_files"
Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
# configuration
MIN_PHRASE_LEN = 2
MAX_PHRASE_LEN = 7
MIN_PHRASE_COUNT = 2
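# phrases of MIN_PHRASE_LEN..MAX_PHRASE_LEN words are counted; phrases seen fewer
# than MIN_PHRASE_COUNT times are dropped from the phrase CSVs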
# sanitize by erasing byte sequences that are not valid UTF-8
sanitized_input_file = File.read(input_file).encode("UTF-8", invalid: :replace, replace: "")
puts "splitting input file into sentences..."
text = sanitized_input_file
  .gsub(/[A-Z].*?:/, '')   # remove speaker labels (uppercase letter up to the next colon)
  .gsub(/--/, '')          # remove double hyphens
  .gsub(/ - /, '')         # remove spaced single hyphens
  .gsub(/\s*'\s*/, "'")    # remove inconsistent spacing around apostrophes
  .gsub(/OK/, 'okay')      # replace "OK" with "okay"
  .squeeze(' ')            # collapse multiple spaces into one
  .strip                   # remove leading/trailing whitespace
  .gsub(/\.{3}/, '')       # remove ellipses (three dots)
  .gsub(/\++\$\++/, '')    # remove "+$+" style artifacts
sentences = text.downcase.gsub(/all right/, "alright").gsub(/[¡¿"]/, "").split(/\n/)
# initialize progress bar for file splitting
bar = ProgressBar.new((sentences.size / 5_000.0).ceil) # one tick per chunk file
puts "creating chunk files..."
sentences.each_slice(5_000).with_index do |chunk, index|
  File.write("#{output_dir}/chunk_#{index + 1}.txt", chunk.join("\n"))
  bar.increment!
end
# files to generate
word_csv = 'word_counts.csv'
word_clean_csv = 'word_counts_no_commas.csv'
phrase_csv = 'phrase_counts.csv'
phrase_clean_csv = 'phrase_counts_no_commas.csv'
# initialize CSVs with headers unless they already exist
{ word_csv => ['word', 'count'], word_clean_csv => ['word', 'count'],
  phrase_csv => ['length', 'phrase', 'count'], phrase_clean_csv => ['length', 'phrase', 'count'] }.each do |file, header|
  CSV.open(file, 'w') { |csv| csv << header } unless File.exist?(file)
end
puts "processing chunks..."
chunk_files = Dir.children(output_dir)
main_bar = ProgressBar.new(chunk_files.size) # one tick per processed chunk
# initialize counters
word_counts = Hash.new(0)
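# nested counters: outer key is the phrase length in words, inner hash maps phrase => count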
phrase_counts = Hash.new { |h,k| h[k] = Hash.new(0) }
phrase_counts_clean = Hash.new { |h,k| h[k] = Hash.new(0) }
chunk_files.each do |filename|
  chunk_path = "#{output_dir}/#{filename}"
  text = File.read(chunk_path)
  # word processing
  words = text.split
  words.each { |word| word_counts[word] += 1 }
  # phrase processing - two versions
  # 1. original version with punctuation
  words_with_punct = text.split
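  # each_cons(n) yields every run of n consecutive words (a sliding window), e.g.
  # %w[the quick brown].each_cons(2).to_a  #=> [["the", "quick"], ["quick", "brown"]]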
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    next if words_with_punct.size < n
    words_with_punct.each_cons(n) do |window|
      phrase = window.join(' ')
      phrase_counts[n][phrase] += 1
    end
  end
  # 2. clean version without punctuation
  clean_words = text.gsub(/[.,!?]/, ' ').split
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    next if clean_words.size < n
    clean_words.each_cons(n) do |window|
      phrase = window.join(' ')
      phrase_counts_clean[n][phrase] += 1
    end
  end
  main_bar.increment!
end
# save word counts
puts "\nSaving word counts..."
CSV.open(word_csv, 'w') do |csv|
csv << ['word', 'count']
word_counts.sort_by { |w,c| -c }.each { |w,c| csv << [w,c] }
end
# save cleaned word counts (original functionality)
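# strips commas from tokens so that e.g. "yes," and "yes" merge into a single entry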
cleaned_words = Hash.new(0)
word_counts.each { |w,c| cleaned_words[w.gsub(/,/,'')] += c }
CSV.open(word_clean_csv, 'w') do |csv|
  csv << ['word', 'count']
  cleaned_words.sort_by { |w,c| -c }.each { |w,c| csv << [w,c] }
end
# save phrase counts (with punctuation)
puts "saving phrase counts (original)..."
CSV.open(phrase_csv, 'w') do |csv|
  csv << ['length', 'phrase', 'count']
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    phrase_counts[n]
      .select { |p,c| c >= MIN_PHRASE_COUNT }
      .sort_by { |p,c| -c }
      .each { |p,c| csv << [n, p, c] }
  end
end
# save cleaned phrase counts (without punctuation)
puts "Saving phrase counts (no commas)..."
CSV.open(phrase_clean_csv, 'w') do |csv|
csv << ['length', 'phrase', 'count']
(MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
phrase_counts_clean[n]
.select { |p,c| c >= MIN_PHRASE_COUNT }
.sort_by { |p,c| -c }
.each { |p,c| csv << [n, p, c] }
end
end
# final report
total_phrases = (MIN_PHRASE_LEN..MAX_PHRASE_LEN).sum { |n| phrase_counts[n].size }
total_phrases_clean = (MIN_PHRASE_LEN..MAX_PHRASE_LEN).sum { |n| phrase_counts_clean[n].size }
puts "\nall done !"
puts "word counts:".ljust(30) + "#{word_counts.size} unique words"
puts "cleaned words:".ljust(30) + "#{cleaned_words.size} merged entries"
puts "phrase counts (original):".ljust(30) + "#{total_phrases} phrases"
puts "phrase counts (no commas):".ljust(30) + "#{total_phrases_clean} phrases"
puts "files created:".ljust(30) + "#{word_csv}, #{word_clean_csv}, #{phrase_csv}, #{phrase_clean_csv}"