Created
August 12, 2025 05:34
-
-
Save DanyWallace/9aa36bcca7341fe7e39552d0745eae4d to your computer and use it in GitHub Desktop.
create a zstd dict from random files in a directory and see the results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

# Trains a zstd dictionary from a random sample of files in a directory and,
# unless --no-compare is given, compresses files from that directory with the
# new dictionary and prints the size savings.
#
# Usage:
#   ruby zdict.rb [SOURCE_DIR] [SAMPLE_SIZE] [--no-compare] [--quiet]
#                 [--delete-input] [--delete-output]
#
# Example:
#   ruby zdict.rb inputFolder 2000 --no-compare --quiet --delete-input --delete-output
#
# Flags:
#   --no-compare     stop after the dictionary has been written
#   --quiet          do not print the per-file comparison lines
#   --delete-input   remove each source file once it has been compressed
#   --delete-output  remove each compressed file once it has been measured
#
# The `zstd` command-line tool must be available on PATH.
require 'fileutils'
require 'open3'

DICT_DIR = "/tmp/dict"
DICT_FILE = "output.dict"
COMPRESSED_DIR = "/tmp/compressed"
EXT = ".json"
KNOWN_FLAGS = %w[--no-compare --quiet --delete-input --delete-output].freeze

# Split flags from positional arguments up front so a flag is never mistaken
# for the source directory or the sample size, wherever it appears.
flags, positional = ARGV.partition { |arg| KNOWN_FLAGS.include?(arg) }

# "." (also the default) means the directory the script is run from.
source_arg = positional[0] || "."
SOURCE_DIR = source_arg == "." ? Dir.pwd : source_arg
SAMPLE_SIZE = (positional[1] || 50).to_i
COMPARE = !flags.include?("--no-compare")
QUIET_COMPARE = flags.include?("--quiet")
DELETE_INPUT = flags.include?("--delete-input")
DELETE_OUTPUT = flags.include?("--delete-output")

# Yields the full path of every regular file directly inside +dir+ whose name
# ends with +ext+. Returns an Enumerator when called without a block.
def each_candidate(dir, ext)
  return enum_for(:each_candidate, dir, ext) unless block_given?

  Dir.foreach(dir) do |entry|
    next unless entry.end_with?(ext)

    path = File.join(dir, entry)
    yield path if File.file?(path)
  end
end

# Converts a byte count to kilobytes, rounded to two decimals.
def to_kb(bytes)
  (bytes / 1024.0).round(2)
end

# Percentage of +original+ bytes saved by shrinking to +compressed+ bytes.
# Returns 0.0 for a zero-byte original instead of NaN/Infinity.
def saved_percent(original, compressed)
  return 0.0 if original.zero?

  ((original - compressed).to_f / original * 100).round(2)
end

# Check if the source directory exists
unless Dir.exist?(SOURCE_DIR)
  abort "ERROR: Source directory '#{SOURCE_DIR}' does not exist."
end

# A non-numeric sample size becomes 0 via to_i and is rejected here as well.
unless SAMPLE_SIZE > 0
  abort "ERROR: Sample size must be a positive integer."
end

puts "Using sample size: #{SAMPLE_SIZE}"

# 1) Reservoir sample: keep the first SAMPLE_SIZE files, then replace a random
#    slot with decreasing probability so every file is equally likely to stay.
reservoir = []
count = 0
each_candidate(SOURCE_DIR, EXT) do |path|
  if count < SAMPLE_SIZE
    reservoir << path
  else
    j = rand(count + 1)
    reservoir[j] = path if j < SAMPLE_SIZE
  end
  count += 1
end

if count < SAMPLE_SIZE
  abort "ERROR: only found #{count} #{EXT} files in #{SOURCE_DIR}, need at least #{SAMPLE_SIZE}"
end

# 2) Prepare a clean dict dir
if Dir.exist?(DICT_DIR)
  puts "- Cleaning existing #{DICT_DIR}…"
  FileUtils.rm_rf(DICT_DIR)
end
FileUtils.mkdir_p(DICT_DIR)

# 3) Copy the sampled files, remembering where each copy landed
puts "- Copying #{SAMPLE_SIZE} random files to #{DICT_DIR}…"
sample_paths = reservoir.map do |src|
  dest = File.join(DICT_DIR, File.basename(src))
  FileUtils.cp(src, dest)
  dest
end

# 4) Train the dictionary. Arguments are passed as an array (no shell), so
#    file names containing spaces or shell metacharacters are handled safely.
puts "- Training zstd dictionary…"
begin
  train_out, train_status = Open3.capture2e("zstd", "--train", *sample_paths, "-o", DICT_FILE)
rescue SystemCallError => e
  # e.g. zstd is not installed
  abort "zstd training failed!\n#{e.message}"
end
abort "zstd training failed!\n#{train_out}" unless train_status.success?

# Report the dictionary size
dict_bytes = File.size(DICT_FILE)
dict_kbytes = (dict_bytes.to_f / 1024).round(2)
puts "- #{DICT_FILE} is #{dict_kbytes} KB (#{dict_bytes} bytes)."

# Nothing more to do when the comparison was disabled
exit 0 unless COMPARE

puts " Compression comparison"

# 5) Compress files with the new dictionary
puts "- Preparing #{COMPRESSED_DIR}…"
FileUtils.rm_rf(COMPRESSED_DIR) if Dir.exist?(COMPRESSED_DIR)
FileUtils.mkdir_p(COMPRESSED_DIR)

total_orig = 0
total_new = 0

# Same selection rule as the sampling step; sorted for a stable order.
files_to_compress = each_candidate(SOURCE_DIR, EXT).sort
compressed = 0

# At most SAMPLE_SIZE files are compressed, so report that number.
puts "- Compressing #{[files_to_compress.size, SAMPLE_SIZE].min} #{EXT} files…"

files_to_compress.each do |src_path|
  break if compressed == SAMPLE_SIZE

  entry = File.basename(src_path)
  orig_bytes = File.size(src_path)
  out_path = File.join(COMPRESSED_DIR, "#{entry}.zst")

  # "--" keeps a path that starts with "-" from being parsed as an option.
  unless system("zstd", "-q", "-D", DICT_FILE, "-o", out_path, "--", src_path)
    warn "WARNING: failed to compress #{entry}"
    next
  end

  # Measure the result before any deletion; the output may be removed below.
  new_bytes = File.size(out_path)

  FileUtils.rm(src_path) if DELETE_INPUT
  FileUtils.rm(out_path) if DELETE_OUTPUT

  # Only successfully compressed files contribute to the totals.
  total_orig += orig_bytes
  total_new += new_bytes

  unless QUIET_COMPARE
    orig_kb = to_kb(orig_bytes)
    new_kb = to_kb(new_bytes)
    saved_pct = saved_percent(orig_bytes, new_bytes)
    puts "Original: #{orig_kb} KB New: #{new_kb} KB Saved: #{saved_pct}% (#{entry})"
  end

  compressed += 1
end

# 6) Summary
total_saved = total_orig - total_new
total_orig_kb = to_kb(total_orig)
total_new_kb = to_kb(total_new)
total_saved_kb = to_kb(total_saved)
overall_pct = saved_percent(total_orig, total_new)

puts "-" * 50
puts "> TOTAL | Original: #{total_orig_kb} KB"
puts "> | Compressed: #{total_new_kb} KB"
puts "> | Saved: #{total_saved_kb} KB (#{overall_pct}%)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment