Skip to content

Instantly share code, notes, and snippets.

@DanyWallace
Created August 12, 2025 05:34
Show Gist options
  • Select an option

  • Save DanyWallace/9aa36bcca7341fe7e39552d0745eae4d to your computer and use it in GitHub Desktop.

Select an option

Save DanyWallace/9aa36bcca7341fe7e39552d0745eae4d to your computer and use it in GitHub Desktop.
create a zstd dict from random files in a directory and see the results
#!/usr/bin/env ruby
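# Selects N random .json files from a source directory, trains a zstd dictionary
# on them, and optionally compresses files with that dictionary to report the savings.
# Usage: ruby zdict.rb inputFolder 2000 --no-compare --quiet --delete-input --delete-output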
require 'fileutils'
# --- Configuration ---------------------------------------------------------
# Boolean switches are removed from ARGV first, so the positional arguments
# (source directory, sample size) are read from whatever remains and a flag
# can appear anywhere on the command line. Every constant is assigned exactly
# once, which avoids Ruby's "already initialized constant" warnings.
#
# Array#delete returns nil when the flag was absent.
COMPARE = ARGV.delete("--no-compare").nil?            # false => exit right after training
QUIET_COMPARE = !ARGV.delete("--quiet").nil?          # true  => no per-file result lines
DELETE_INPUT = !ARGV.delete("--delete-input").nil?    # true  => remove each source file once compressed
DELETE_OUTPUT = !ARGV.delete("--delete-output").nil?  # true  => remove each .zst once measured

# Positional arguments: [0] source directory, [1] sample size.
# "." means the directory from which this script is run.
source_arg = ARGV[0] || "."
SOURCE_DIR = source_arg == "." ? Dir.pwd : source_arg
DICT_DIR = "/tmp/dict"
DICT_FILE = "output.dict"
COMPRESSED_DIR = "/tmp/compressed"
SAMPLE_SIZE = (ARGV[1] || 50).to_i
EXT = ".json"

# Check if the source directory exists
abort "ERROR: Source directory '#{SOURCE_DIR}' does not exist." unless Dir.exist?(SOURCE_DIR)
# Check if the sample size is a positive integer (non-numeric input becomes 0 via to_i)
abort "ERROR: Sample size must be a positive integer." unless SAMPLE_SIZE > 0
puts "Using sample size: #{SAMPLE_SIZE}"
# 1) Reservoir sample: one pass over SOURCE_DIR keeping SAMPLE_SIZE paths.
#    The first SAMPLE_SIZE matching files fill the reservoir; each later file
#    draws a random slot among all files seen so far and replaces an entry
#    only when that slot falls inside the reservoir.
reservoir = []
seen = 0
Dir.foreach(SOURCE_DIR) do |name|
  next unless name.end_with?(EXT)

  candidate = File.join(SOURCE_DIR, name)
  next unless File.file?(candidate)

  seen += 1
  if reservoir.size < SAMPLE_SIZE
    reservoir << candidate
    next
  end

  slot = rand(seen)
  reservoir[slot] = candidate if slot < SAMPLE_SIZE
end
abort "ERROR: only found #{seen} #{EXT} files in #{SOURCE_DIR}, need at least #{SAMPLE_SIZE}" if seen < SAMPLE_SIZE
# Start from an empty staging directory for the training samples.
if File.directory?(DICT_DIR)
  puts "- Cleaning existing #{DICT_DIR}…"
  FileUtils.rm_rf(DICT_DIR)
end
FileUtils.mkdir_p(DICT_DIR)

# Copy the sampled files into the staging directory.
puts "- Copying #{SAMPLE_SIZE} random files to #{DICT_DIR}…"
FileUtils.cp(reservoir, DICT_DIR)

# Train the dictionary through the shell; stderr is folded into the captured
# output so it can be shown if zstd fails.
puts "- Training zstd dictionary…"
train_out = `zstd --train #{DICT_DIR}/* -o #{DICT_FILE} 2>&1`
abort "zstd training failed!\n#{train_out}" unless $?.success?

# Report the size of the resulting dictionary file.
dict_size = File.size(DICT_FILE)
dict_kb = (dict_size / 1024.0).round(2)
puts "- #{DICT_FILE} is #{dict_kb} KB (#{dict_size} bytes)."
# Nothing more to do when the comparison pass was disabled.
exit 0 unless COMPARE

puts " Compression comparison"
# Compress files with the trained dictionary and collect size statistics.
puts "- Preparing #{COMPRESSED_DIR}…"
FileUtils.rm_rf(COMPRESSED_DIR) # no-op when the directory does not exist
FileUtils.mkdir_p(COMPRESSED_DIR)
total_orig = 0
total_new = 0
# Snapshot of the EXT files currently present in SOURCE_DIR.
files_to_compress = Dir.glob("#{SOURCE_DIR}/*#{EXT}")
iterations = 0 # number of files compressed successfully so far
puts "- Compressing up to #{SAMPLE_SIZE} of #{files_to_compress.size} #{EXT} files…"
files_to_compress.each do |src_path|
  # Stop once SAMPLE_SIZE files have been compressed successfully.
  break if iterations == SAMPLE_SIZE

  entry = File.basename(src_path)
  orig_bytes = File.size(src_path)
  out_path = File.join(COMPRESSED_DIR, "#{entry}.zst")

  # Argument-list form of system: no shell is involved, so paths containing
  # spaces or shell metacharacters are passed to zstd verbatim.
  unless system("zstd", "-q", "-D", DICT_FILE, "-o", out_path, src_path)
    warn "WARNING: failed to compress #{entry}"
    next
  end

  # Measure the output BEFORE any deletion; with DELETE_OUTPUT the file is
  # gone afterwards and File.size would raise Errno::ENOENT.
  new_bytes = File.size(out_path)

  # Only successfully compressed files contribute to the totals, so a failed
  # file cannot inflate the "original" side of the summary.
  total_orig += orig_bytes
  total_new += new_bytes

  FileUtils.rm(src_path) if DELETE_INPUT
  FileUtils.rm(out_path) if DELETE_OUTPUT

  unless QUIET_COMPARE
    orig_kb = (orig_bytes / 1024.0).round(2)
    new_kb = (new_bytes / 1024.0).round(2)
    # An empty input has no meaningful ratio; report 0.0 instead of NaN.
    saved_pct = orig_bytes.zero? ? 0.0 : ((orig_bytes - new_bytes).to_f / orig_bytes * 100).round(2)
    puts "Original: #{orig_kb} KB New: #{new_kb} KB Saved: #{saved_pct}% (#{entry})"
  end
  iterations += 1
end
# Summary: totals across every file counted by the comparison loop.
total_saved = total_orig - total_new
total_orig_kb = (total_orig / 1024.0).round(2)
total_new_kb = (total_new / 1024.0).round(2)
total_saved_kb = (total_saved / 1024.0).round(2)
# Guard the ratio: with zero original bytes the division would yield NaN.
overall_pct = total_orig.zero? ? 0.0 : (total_saved.to_f / total_orig * 100).round(2)
puts "-" * 50
puts "> TOTAL | Original: #{total_orig_kb} KB"
puts "> | Compressed: #{total_new_kb} KB"
puts "> | Saved: #{total_saved_kb} KB (#{overall_pct}%)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment