Created August 9, 2025 22:03
| ubuntu@mi25:~/llama.cpp (master) $ ./build/bin/llama-perplexity --n-gpu-layers 100 --split-mode layer -m ~/.cache/llama.cpp/unsloth_gpt-oss-20b-GGUF_gpt-oss-20b-F16.gguf -f ~/polano.txt | |
| printf '\n' | |
| ./build/bin/llama-perplexity --n-gpu-layers 100 --split-mode layer -m ~/.cache/llama.cpp/unsloth_Qwen3-30B-A3B-Instruct-2507-GGUF_Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf -f ~/polano.txt | |
| printf '\n' | |
| ./build/bin/llama-perplexity --n-gpu-layers 100 --split-mode layer -m ~/.cache/llama.cpp/unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-UD-Q8_K_XL.gguf -f ~/polano.txt | |
| ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no | |
| ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no | |
| ggml_cuda_init: found 4 ROCm devices: | |
| Device 0: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 1: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 2: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 3: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| build: 6112 (99acbc99) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | |
| llama_model_load_from_file_impl: using device ROCm0 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm1 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm2 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm3 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_loader: loaded meta data with 37 key-value pairs and 459 tensors from /home/ubuntu/.cache/llama.cpp/unsloth_gpt-oss-20b-GGUF_gpt-oss-20b-F16.gguf (version GGUF V3 (latest)) | |
| llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
| llama_model_loader: - kv 0: general.architecture str = gpt-oss | |
| llama_model_loader: - kv 1: general.type str = model | |
| llama_model_loader: - kv 2: general.name str = Gpt-Oss-20B | |
| llama_model_loader: - kv 3: general.basename str = Gpt-Oss-20B | |
| llama_model_loader: - kv 4: general.quantized_by str = Unsloth | |
| llama_model_loader: - kv 5: general.size_label str = 20B | |
| llama_model_loader: - kv 6: general.license str = apache-2.0 | |
| llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth | |
| llama_model_loader: - kv 8: general.tags arr[str,2] = ["vllm", "text-generation"] | |
| llama_model_loader: - kv 9: gpt-oss.block_count u32 = 24 | |
| llama_model_loader: - kv 10: gpt-oss.context_length u32 = 131072 | |
| llama_model_loader: - kv 11: gpt-oss.embedding_length u32 = 2880 | |
| llama_model_loader: - kv 12: gpt-oss.feed_forward_length u32 = 2880 | |
| llama_model_loader: - kv 13: gpt-oss.attention.head_count u32 = 64 | |
| llama_model_loader: - kv 14: gpt-oss.attention.head_count_kv u32 = 8 | |
| llama_model_loader: - kv 15: gpt-oss.rope.freq_base f32 = 150000.000000 | |
| llama_model_loader: - kv 16: gpt-oss.attention.layer_norm_rms_epsilon f32 = 0.000010 | |
| llama_model_loader: - kv 17: gpt-oss.expert_count u32 = 32 | |
| llama_model_loader: - kv 18: gpt-oss.expert_used_count u32 = 4 | |
| llama_model_loader: - kv 19: gpt-oss.attention.key_length u32 = 64 | |
| llama_model_loader: - kv 20: gpt-oss.attention.value_length u32 = 64 | |
| llama_model_loader: - kv 21: general.file_type u32 = 1 | |
| llama_model_loader: - kv 22: gpt-oss.attention.sliding_window u32 = 128 | |
| llama_model_loader: - kv 23: gpt-oss.expert_feed_forward_length u32 = 2880 | |
| llama_model_loader: - kv 24: gpt-oss.rope.scaling.type str = yarn | |
| llama_model_loader: - kv 25: gpt-oss.rope.scaling.factor f32 = 32.000000 | |
| llama_model_loader: - kv 26: gpt-oss.rope.scaling.original_context_length u32 = 4096 | |
| llama_model_loader: - kv 27: general.quantization_version u32 = 2 | |
| llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2 | |
| llama_model_loader: - kv 29: tokenizer.ggml.pre str = gpt-4o | |
| llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,201088] = ["!", "\"", "#", "$", "%", "&", "'", ... | |
| llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,201088] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | |
| llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,446189] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... | |
| llama_model_loader: - kv 33: tokenizer.ggml.bos_token_id u32 = 199998 | |
| llama_model_loader: - kv 34: tokenizer.ggml.eos_token_id u32 = 200002 | |
| llama_model_loader: - kv 35: tokenizer.ggml.padding_token_id u32 = 200017 | |
| llama_model_loader: - kv 36: tokenizer.chat_template str = {# Copyright 2025-present Unsloth. Ap... | |
| llama_model_loader: - type f32: 289 tensors | |
| llama_model_loader: - type f16: 98 tensors | |
| llama_model_loader: - type mxfp4: 72 tensors | |
| print_info: file format = GGUF V3 (latest) | |
| print_info: file type = F16 | |
| print_info: file size = 12.83 GiB (5.27 BPW) | |
| load: printing all EOG tokens: | |
| load: - 199999 ('<|endoftext|>') | |
| load: - 200002 ('<|return|>') | |
| load: - 200007 ('<|end|>') | |
| load: - 200012 ('<|call|>') | |
| load: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list | |
| load: special tokens cache size = 21 | |
| load: token to piece cache size = 1.3332 MB | |
| print_info: arch = gpt-oss | |
| print_info: vocab_only = 0 | |
| print_info: n_ctx_train = 131072 | |
| print_info: n_embd = 2880 | |
| print_info: n_layer = 24 | |
| print_info: n_head = 64 | |
| print_info: n_head_kv = 8 | |
| print_info: n_rot = 64 | |
| print_info: n_swa = 128 | |
| print_info: is_swa_any = 1 | |
| print_info: n_embd_head_k = 64 | |
| print_info: n_embd_head_v = 64 | |
| print_info: n_gqa = 8 | |
| print_info: n_embd_k_gqa = 512 | |
| print_info: n_embd_v_gqa = 512 | |
| print_info: f_norm_eps = 0.0e+00 | |
| print_info: f_norm_rms_eps = 1.0e-05 | |
| print_info: f_clamp_kqv = 0.0e+00 | |
| print_info: f_max_alibi_bias = 0.0e+00 | |
| print_info: f_logit_scale = 0.0e+00 | |
| print_info: f_attn_scale = 0.0e+00 | |
| print_info: n_ff = 2880 | |
| print_info: n_expert = 32 | |
| print_info: n_expert_used = 4 | |
| print_info: causal attn = 1 | |
| print_info: pooling type = 0 | |
| print_info: rope type = 2 | |
| print_info: rope scaling = yarn | |
| print_info: freq_base_train = 150000.0 | |
| print_info: freq_scale_train = 0.03125 | |
| print_info: n_ctx_orig_yarn = 4096 | |
| print_info: rope_finetuned = unknown | |
| print_info: model type = ?B | |
| print_info: model params = 20.91 B | |
| print_info: general.name = Gpt-Oss-20B | |
| print_info: n_ff_exp = 2880 | |
| print_info: vocab type = BPE | |
| print_info: n_vocab = 201088 | |
| print_info: n_merges = 446189 | |
| print_info: BOS token = 199998 '<|startoftext|>' | |
| print_info: EOS token = 200002 '<|return|>' | |
| print_info: EOT token = 199999 '<|endoftext|>' | |
| print_info: PAD token = 200017 '<|reserved_200017|>' | |
| print_info: LF token = 198 'Ċ' | |
| print_info: EOG token = 199999 '<|endoftext|>' | |
| print_info: EOG token = 200002 '<|return|>' | |
| print_info: EOG token = 200012 '<|call|>' | |
| print_info: max token length = 256 | |
| load_tensors: loading model tensors, this can take a while... (mmap = true) | |
| load_tensors: offloading 24 repeating layers to GPU | |
| load_tensors: offloading output layer to GPU | |
| load_tensors: offloaded 25/25 layers to GPU | |
| load_tensors: ROCm0 model buffer size = 3188.52 MiB | |
| load_tensors: ROCm1 model buffer size = 2733.01 MiB | |
| load_tensors: ROCm2 model buffer size = 2733.01 MiB | |
| load_tensors: ROCm3 model buffer size = 3382.13 MiB | |
| load_tensors: CPU_Mapped model buffer size = 1104.61 MiB | |
| ................................................................................... | |
| llama_context: constructing llama_context | |
| llama_context: n_seq_max = 4 | |
| llama_context: n_ctx = 2048 | |
| llama_context: n_ctx_per_seq = 512 | |
| llama_context: n_batch = 2048 | |
| llama_context: n_ubatch = 512 | |
| llama_context: causal_attn = 1 | |
| llama_context: flash_attn = 0 | |
| llama_context: kv_unified = false | |
| llama_context: freq_base = 150000.0 | |
| llama_context: freq_scale = 0.03125 | |
| llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized | |
| llama_context: requested n_seq_max (4) > 1, but swa_full is not enabled -- performance may be degraded: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 | |
| llama_context: ROCm_Host output buffer size = 3.07 MiB | |
| llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 512 cells | |
| llama_kv_cache_unified: ROCm0 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: ROCm1 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: ROCm2 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: ROCm3 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: size = 48.00 MiB ( 512 cells, 12 layers, 4/4 seqs), K (f16): 24.00 MiB, V (f16): 24.00 MiB | |
| llama_kv_cache_unified_iswa: creating SWA KV cache, size = 512 cells | |
| llama_kv_cache_unified: ROCm0 KV buffer size = 16.00 MiB | |
| llama_kv_cache_unified: ROCm1 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: ROCm2 KV buffer size = 12.00 MiB | |
| llama_kv_cache_unified: ROCm3 KV buffer size = 8.00 MiB | |
| llama_kv_cache_unified: size = 48.00 MiB ( 512 cells, 12 layers, 4/4 seqs), K (f16): 24.00 MiB, V (f16): 24.00 MiB | |
| llama_context: pipeline parallelism enabled (n_copies=4) | |
| llama_context: ROCm0 compute buffer size = 137.79 MiB | |
| llama_context: ROCm1 compute buffer size = 137.79 MiB | |
| llama_context: ROCm2 compute buffer size = 137.79 MiB | |
| llama_context: ROCm3 compute buffer size = 444.92 MiB | |
| llama_context: ROCm_Host compute buffer size = 29.68 MiB | |
| llama_context: graph nodes = 1446 | |
| llama_context: graph splits = 5 | |
| common_init_from_params: added <|endoftext|> logit bias = -inf | |
| common_init_from_params: added <|return|> logit bias = -inf | |
| common_init_from_params: added <|call|> logit bias = -inf | |
| common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048 | |
| common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) | |
| system_info: n_threads = 12 (n_threads_batch = 12) / 24 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | | |
| perplexity: tokenizing the input .. | |
| perplexity: tokenization took 110.99 ms | |
| perplexity: calculating perplexity over 70 chunks, n_ctx=512, batch_size=2048, n_seq=4 | |
| perplexity: 7.18 seconds per pass - ETA 2.08 minutes | |
| [1]112.9377,[2]122.1352,[3]100.6324,[4]93.0957,[5]89.7141,[6]90.4398,[7]91.6406,[8]92.1841,[9]89.5061,[10]90.4190,[11]89.3026,[12]89.9131,[13]88.5398,[14]89.1334,[15]88.7462,[16]88.9980,[17]91.1456,[18]90.6418,[19]91.2020,[20]89.6155,[21]88.7622,[22]89.3350,[23]90.7457,[24]90.6350,[25]90.4557,[26]89.7106,[27]89.7938,[28]91.0184,[29]91.4180,[30]90.7831,[31]90.9751,[32]91.4860,[33]92.0903,[34]92.0968,[35]91.1305,[36]90.8390,[37]90.3689,[38]90.5001,[39]90.0757,[40]89.9540,[41]89.5802,[42]90.1338,[43]90.4896,[44]89.7632,[45]89.6120,[46]89.8986,[47]89.9305,[48]90.4410,[49]90.1637,[50]90.2724,[51]90.7497,[52]90.0331,[53]90.1321,[54]90.2238,[55]90.0770,[56]89.4811,[57]90.0112,[58]90.5232,[59]90.5199,[60]90.4949,[61]90.7617,[62]90.6152,[63]90.5651,[64]90.4227,[65]90.2705,[66]90.3151,[67]89.8836,[68]90.2719,[69]90.5306,[70]90.8771, | |
| Final estimate: PPL = 90.8771 +/- 1.61906 | |
| llama_perf_context_print: load time = 19779.11 ms | |
| llama_perf_context_print: prompt eval time = 104807.30 ms / 35840 tokens ( 2.92 ms per token, 341.96 tokens per second) | |
| llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) | |
| llama_perf_context_print: total time = 109861.73 ms / 35841 tokens | |
| llama_perf_context_print: graphs reused = 0 | |
| ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no | |
| ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no | |
| ggml_cuda_init: found 4 ROCm devices: | |
| Device 0: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 1: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 2: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 3: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| build: 6112 (99acbc99) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | |
| llama_model_load_from_file_impl: using device ROCm0 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm1 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm2 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm3 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_loader: loaded meta data with 45 key-value pairs and 579 tensors from /home/ubuntu/.cache/llama.cpp/unsloth_Qwen3-30B-A3B-Instruct-2507-GGUF_Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) | |
| llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
| llama_model_loader: - kv 0: general.architecture str = qwen3moe | |
| llama_model_loader: - kv 1: general.type str = model | |
| llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B-Instruct-2507 | |
| llama_model_loader: - kv 3: general.version str = 2507 | |
| llama_model_loader: - kv 4: general.finetune str = Instruct | |
| llama_model_loader: - kv 5: general.basename str = Qwen3-30B-A3B-Instruct-2507 | |
| llama_model_loader: - kv 6: general.quantized_by str = Unsloth | |
| llama_model_loader: - kv 7: general.size_label str = 30B-A3B | |
| llama_model_loader: - kv 8: general.license str = apache-2.0 | |
| llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B... | |
| llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth | |
| llama_model_loader: - kv 11: general.base_model.count u32 = 1 | |
| llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507 | |
| llama_model_loader: - kv 13: general.base_model.0.version str = 2507 | |
| llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen | |
| llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B... | |
| llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] | |
| llama_model_loader: - kv 17: qwen3moe.block_count u32 = 48 | |
| llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 | |
| llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 2048 | |
| llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 6144 | |
| llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 32 | |
| llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 | |
| llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 10000000.000000 | |
| llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 | |
| llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 | |
| llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 | |
| llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 | |
| llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 | |
| llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 768 | |
| llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 | |
| llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 | |
| llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... | |
| llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | |
| llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... | |
| llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 | |
| llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 | |
| llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false | |
| llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... | |
| llama_model_loader: - kv 39: general.quantization_version u32 = 2 | |
| llama_model_loader: - kv 40: general.file_type u32 = 7 | |
| llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-30B-A3B-Instruct-2507-GGUF/imat... | |
| llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-30B-A3B-Ins... | |
| llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 384 | |
| llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 | |
| llama_model_loader: - type f32: 241 tensors | |
| llama_model_loader: - type f16: 75 tensors | |
| llama_model_loader: - type q8_0: 263 tensors | |
| print_info: file format = GGUF V3 (latest) | |
| print_info: file type = Q8_0 | |
| print_info: file size = 33.51 GiB (9.43 BPW) | |
| load: printing all EOG tokens: | |
| load: - 151643 ('<|endoftext|>') | |
| load: - 151645 ('<|im_end|>') | |
| load: - 151662 ('<|fim_pad|>') | |
| load: - 151663 ('<|repo_name|>') | |
| load: - 151664 ('<|file_sep|>') | |
| load: special tokens cache size = 26 | |
| load: token to piece cache size = 0.9311 MB | |
| print_info: arch = qwen3moe | |
| print_info: vocab_only = 0 | |
| print_info: n_ctx_train = 262144 | |
| print_info: n_embd = 2048 | |
| print_info: n_layer = 48 | |
| print_info: n_head = 32 | |
| print_info: n_head_kv = 4 | |
| print_info: n_rot = 128 | |
| print_info: n_swa = 0 | |
| print_info: is_swa_any = 0 | |
| print_info: n_embd_head_k = 128 | |
| print_info: n_embd_head_v = 128 | |
| print_info: n_gqa = 8 | |
| print_info: n_embd_k_gqa = 512 | |
| print_info: n_embd_v_gqa = 512 | |
| print_info: f_norm_eps = 0.0e+00 | |
| print_info: f_norm_rms_eps = 1.0e-06 | |
| print_info: f_clamp_kqv = 0.0e+00 | |
| print_info: f_max_alibi_bias = 0.0e+00 | |
| print_info: f_logit_scale = 0.0e+00 | |
| print_info: f_attn_scale = 0.0e+00 | |
| print_info: n_ff = 6144 | |
| print_info: n_expert = 128 | |
| print_info: n_expert_used = 8 | |
| print_info: causal attn = 1 | |
| print_info: pooling type = 0 | |
| print_info: rope type = 2 | |
| print_info: rope scaling = linear | |
| print_info: freq_base_train = 10000000.0 | |
| print_info: freq_scale_train = 1 | |
| print_info: n_ctx_orig_yarn = 262144 | |
| print_info: rope_finetuned = unknown | |
| print_info: model type = 30B.A3B | |
| print_info: model params = 30.53 B | |
| print_info: general.name = Qwen3-30B-A3B-Instruct-2507 | |
| print_info: n_ff_exp = 768 | |
| print_info: vocab type = BPE | |
| print_info: n_vocab = 151936 | |
| print_info: n_merges = 151387 | |
| print_info: BOS token = 11 ',' | |
| print_info: EOS token = 151645 '<|im_end|>' | |
| print_info: EOT token = 151645 '<|im_end|>' | |
| print_info: PAD token = 151654 '<|vision_pad|>' | |
| print_info: LF token = 198 'Ċ' | |
| print_info: FIM PRE token = 151659 '<|fim_prefix|>' | |
| print_info: FIM SUF token = 151661 '<|fim_suffix|>' | |
| print_info: FIM MID token = 151660 '<|fim_middle|>' | |
| print_info: FIM PAD token = 151662 '<|fim_pad|>' | |
| print_info: FIM REP token = 151663 '<|repo_name|>' | |
| print_info: FIM SEP token = 151664 '<|file_sep|>' | |
| print_info: EOG token = 151643 '<|endoftext|>' | |
| print_info: EOG token = 151645 '<|im_end|>' | |
| print_info: EOG token = 151662 '<|fim_pad|>' | |
| print_info: EOG token = 151663 '<|repo_name|>' | |
| print_info: EOG token = 151664 '<|file_sep|>' | |
| print_info: max token length = 256 | |
| load_tensors: loading model tensors, this can take a while... (mmap = true) | |
| load_tensors: offloading 48 repeating layers to GPU | |
| load_tensors: offloading output layer to GPU | |
| load_tensors: offloaded 49/49 layers to GPU | |
| load_tensors: ROCm0 model buffer size = 9875.34 MiB | |
| load_tensors: ROCm1 model buffer size = 7596.95 MiB | |
| load_tensors: ROCm2 model buffer size = 7596.95 MiB | |
| load_tensors: ROCm3 model buffer size = 8654.26 MiB | |
| load_tensors: CPU_Mapped model buffer size = 593.50 MiB | |
| ................................................................................................. | |
| llama_context: constructing llama_context | |
| llama_context: n_seq_max = 4 | |
| llama_context: n_ctx = 2048 | |
| llama_context: n_ctx_per_seq = 512 | |
| llama_context: n_batch = 2048 | |
| llama_context: n_ubatch = 512 | |
| llama_context: causal_attn = 1 | |
| llama_context: flash_attn = 0 | |
| llama_context: kv_unified = false | |
| llama_context: freq_base = 10000000.0 | |
| llama_context: freq_scale = 1 | |
| llama_context: n_ctx_per_seq (512) < n_ctx_train (262144) -- the full capacity of the model will not be utilized | |
| llama_context: ROCm_Host output buffer size = 2.32 MiB | |
| llama_kv_cache_unified: ROCm0 KV buffer size = 52.00 MiB | |
| llama_kv_cache_unified: ROCm1 KV buffer size = 48.00 MiB | |
| llama_kv_cache_unified: ROCm2 KV buffer size = 48.00 MiB | |
| llama_kv_cache_unified: ROCm3 KV buffer size = 44.00 MiB | |
| llama_kv_cache_unified: size = 192.00 MiB ( 512 cells, 48 layers, 4/4 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB | |
| llama_context: pipeline parallelism enabled (n_copies=4) | |
| llama_context: ROCm0 compute buffer size = 104.52 MiB | |
| llama_context: ROCm1 compute buffer size = 104.52 MiB | |
| llama_context: ROCm2 compute buffer size = 104.52 MiB | |
| llama_context: ROCm3 compute buffer size = 328.78 MiB | |
| llama_context: ROCm_Host compute buffer size = 16.04 MiB | |
| llama_context: graph nodes = 3174 | |
| llama_context: graph splits = 5 | |
| common_init_from_params: added <|endoftext|> logit bias = -inf | |
| common_init_from_params: added <|im_end|> logit bias = -inf | |
| common_init_from_params: added <|fim_pad|> logit bias = -inf | |
| common_init_from_params: added <|repo_name|> logit bias = -inf | |
| common_init_from_params: added <|file_sep|> logit bias = -inf | |
| common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048 | |
| common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) | |
| system_info: n_threads = 12 (n_threads_batch = 12) / 24 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | | |
| perplexity: tokenizing the input .. | |
| perplexity: tokenization took 93.719 ms | |
| perplexity: calculating perplexity over 62 chunks, n_ctx=512, batch_size=2048, n_seq=4 | |
| perplexity: 9.06 seconds per pass - ETA 2.33 minutes | |
| [1]17.8927,[2]20.1188,[3]17.5880,[4]15.7937,[5]15.1505,[6]14.3488,[7]14.3189,[8]14.0705,[9]14.0010,[10]13.7464,[11]13.9477,[12]14.2810,[13]14.2990,[14]14.0895,[15]14.3763,[16]14.2257,[17]14.3015,[18]14.1060,[19]14.0180,[20]14.5269,[21]14.6234,[22]14.5623,[23]14.5912,[24]14.8327,[25]14.5430,[26]14.6653,[27]14.3976,[28]14.4389,[29]14.1773,[30]14.0281,[31]13.8505,[32]13.9944,[33]14.2497,[34]14.1862,[35]14.2975,[36]14.1557,[37]14.1691,[38]14.2407,[39]14.3550,[40]14.4891,[41]14.4864,[42]14.4668,[43]14.3618,[44]14.4407,[45]14.4565,[46]14.3720,[47]14.5502,[48]14.4015,[49]14.4064,[50]14.3448,[51]14.3075,[52]14.3204,[53]14.3594,[54]14.3976,[55]14.3658,[56]14.5180,[57]14.6384,[58]14.7600,[59]14.8412,[60]14.7975,[61]14.6323,[62]14.7477, | |
| Final estimate: PPL = 14.7477 +/- 0.35399 | |
| llama_perf_context_print: load time = 46797.67 ms | |
| llama_perf_context_print: prompt eval time = 113468.18 ms / 31744 tokens ( 3.57 ms per token, 279.76 tokens per second) | |
| llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) | |
| llama_perf_context_print: total time = 117263.57 ms / 31745 tokens | |
| llama_perf_context_print: graphs reused = 0 | |
| ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no | |
| ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no | |
| ggml_cuda_init: found 4 ROCm devices: | |
| Device 0: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 1: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 2: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| Device 3: Radeon Instinct MI25, gfx900:xnack- (0x900), VMM: no, Wave Size: 64 | |
| build: 6112 (99acbc99) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | |
| llama_model_load_from_file_impl: using device ROCm0 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm1 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm2 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_load_from_file_impl: using device ROCm3 (Radeon Instinct MI25) - 16352 MiB free | |
| llama_model_loader: loaded meta data with 40 key-value pairs and 808 tensors from /home/ubuntu/.cache/llama.cpp/unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) | |
| llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. | |
| llama_model_loader: - kv 0: general.architecture str = gemma3 | |
| llama_model_loader: - kv 1: general.type str = model | |
| llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It | |
| llama_model_loader: - kv 3: general.finetune str = it | |
| llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It | |
| llama_model_loader: - kv 5: general.quantized_by str = Unsloth | |
| llama_model_loader: - kv 6: general.size_label str = 27B | |
| llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth | |
| llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 | |
| llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 | |
| llama_model_loader: - kv 10: gemma3.block_count u32 = 62 | |
| llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 | |
| llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 | |
| llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 | |
| llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 | |
| llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 128 | |
| llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 | |
| llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 | |
| llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 16 | |
| llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear | |
| llama_model_loader: - kv 20: gemma3.rope.scaling.factor f32 = 8.000000 | |
| llama_model_loader: - kv 21: tokenizer.ggml.model str = llama | |
| llama_model_loader: - kv 22: tokenizer.ggml.pre str = default | |
| llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["<pad>", "<eos>", "<bos>", "<unk>", ... | |
| llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... | |
| llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... | |
| llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 | |
| llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 | |
| llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 | |
| llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 | |
| llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true | |
| llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false | |
| llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... | |
| llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false | |
| llama_model_loader: - kv 34: general.quantization_version u32 = 2 | |
| llama_model_loader: - kv 35: general.file_type u32 = 7 | |
| llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-27b-it-GGUF/imatrix_unsloth.dat | |
| llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-27b-it.txt | |
| llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 434 | |
| llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 | |
| llama_model_loader: - type f32: 373 tensors | |
| llama_model_loader: - type q8_0: 409 tensors | |
| llama_model_loader: - type bf16: 26 tensors | |
| print_info: file format = GGUF V3 (latest) | |
| print_info: file type = Q8_0 | |
| print_info: file size = 29.62 GiB (9.42 BPW) | |
| load: printing all EOG tokens: | |
| load: - 106 ('<end_of_turn>') | |
| load: special tokens cache size = 6415 | |
| load: token to piece cache size = 1.9446 MB | |
| print_info: arch = gemma3 | |
| print_info: vocab_only = 0 | |
| print_info: n_ctx_train = 131072 | |
| print_info: n_embd = 5376 | |
| print_info: n_layer = 62 | |
| print_info: n_head = 32 | |
| print_info: n_head_kv = 16 | |
| print_info: n_rot = 128 | |
| print_info: n_swa = 1024 | |
| print_info: is_swa_any = 1 | |
| print_info: n_embd_head_k = 128 | |
| print_info: n_embd_head_v = 128 | |
| print_info: n_gqa = 2 | |
| print_info: n_embd_k_gqa = 2048 | |
| print_info: n_embd_v_gqa = 2048 | |
| print_info: f_norm_eps = 0.0e+00 | |
| print_info: f_norm_rms_eps = 1.0e-06 | |
| print_info: f_clamp_kqv = 0.0e+00 | |
| print_info: f_max_alibi_bias = 0.0e+00 | |
| print_info: f_logit_scale = 0.0e+00 | |
| print_info: f_attn_scale = 7.7e-02 | |
| print_info: n_ff = 21504 | |
| print_info: n_expert = 0 | |
| print_info: n_expert_used = 0 | |
| print_info: causal attn = 1 | |
| print_info: pooling type = 0 | |
| print_info: rope type = 2 | |
| print_info: rope scaling = linear | |
| print_info: freq_base_train = 1000000.0 | |
| print_info: freq_scale_train = 0.125 | |
| print_info: n_ctx_orig_yarn = 131072 | |
| print_info: rope_finetuned = unknown | |
| print_info: model type = 27B | |
| print_info: model params = 27.01 B | |
| print_info: general.name = Gemma-3-27B-It | |
| print_info: vocab type = SPM | |
| print_info: n_vocab = 262208 | |
| print_info: n_merges = 0 | |
| print_info: BOS token = 2 '<bos>' | |
| print_info: EOS token = 106 '<end_of_turn>' | |
| print_info: EOT token = 106 '<end_of_turn>' | |
| print_info: UNK token = 3 '<unk>' | |
| print_info: PAD token = 0 '<pad>' | |
| print_info: LF token = 248 '<0x0A>' | |
| print_info: EOG token = 106 '<end_of_turn>' | |
| print_info: max token length = 48 | |
| load_tensors: loading model tensors, this can take a while... (mmap = true) | |
| load_tensors: offloading 62 repeating layers to GPU | |
| load_tensors: offloading output layer to GPU | |
| load_tensors: offloaded 63/63 layers to GPU | |
| load_tensors: ROCm0 model buffer size = 8053.54 MiB | |
| load_tensors: ROCm1 model buffer size = 6695.11 MiB | |
| load_tensors: ROCm2 model buffer size = 6695.11 MiB | |
| load_tensors: ROCm3 model buffer size = 8886.50 MiB | |
| load_tensors: CPU_Mapped model buffer size = 2688.66 MiB | |
| .................................................................................... | |
| llama_context: constructing llama_context | |
| llama_context: n_seq_max = 4 | |
| llama_context: n_ctx = 2048 | |
| llama_context: n_ctx_per_seq = 512 | |
| llama_context: n_batch = 2048 | |
| llama_context: n_ubatch = 512 | |
| llama_context: causal_attn = 1 | |
| llama_context: flash_attn = 0 | |
| llama_context: kv_unified = false | |
| llama_context: freq_base = 1000000.0 | |
| llama_context: freq_scale = 0.125 | |
| llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized | |
| llama_context: requested n_seq_max (4) > 1, but swa_full is not enabled -- performance may be degraded: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 | |
| llama_context: ROCm_Host output buffer size = 4.00 MiB | |
| llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 512 cells | |
| llama_kv_cache_unified: ROCm0 KV buffer size = 32.00 MiB | |
| llama_kv_cache_unified: ROCm1 KV buffer size = 48.00 MiB | |
| llama_kv_cache_unified: ROCm2 KV buffer size = 48.00 MiB | |
| llama_kv_cache_unified: ROCm3 KV buffer size = 32.00 MiB | |
| llama_kv_cache_unified: size = 160.00 MiB ( 512 cells, 10 layers, 4/4 seqs), K (f16): 80.00 MiB, V (f16): 80.00 MiB | |
| llama_kv_cache_unified_iswa: creating SWA KV cache, size = 512 cells | |
| llama_kv_cache_unified: ROCm0 KV buffer size = 224.00 MiB | |
| llama_kv_cache_unified: ROCm1 KV buffer size = 208.00 MiB | |
| llama_kv_cache_unified: ROCm2 KV buffer size = 208.00 MiB | |
| llama_kv_cache_unified: ROCm3 KV buffer size = 192.00 MiB | |
| llama_kv_cache_unified: size = 832.00 MiB ( 512 cells, 52 layers, 4/4 seqs), K (f16): 416.00 MiB, V (f16): 416.00 MiB | |
| llama_context: pipeline parallelism enabled (n_copies=4) | |
| llama_context: ROCm0 compute buffer size = 277.04 MiB | |
| llama_context: ROCm1 compute buffer size = 277.04 MiB | |
| llama_context: ROCm2 compute buffer size = 277.04 MiB | |
| llama_context: ROCm3 compute buffer size = 636.67 MiB | |
| llama_context: ROCm_Host compute buffer size = 82.55 MiB | |
| llama_context: graph nodes = 2735 | |
| llama_context: graph splits = 5 | |
| common_init_from_params: added <end_of_turn> logit bias = -inf | |
| common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048 | |
| common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) | |
| system_info: n_threads = 12 (n_threads_batch = 12) / 24 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | | |
| perplexity: tokenizing the input .. | |
| perplexity: tokenization took 38.711 ms | |
| perplexity: calculating perplexity over 58 chunks, n_ctx=512, batch_size=2048, n_seq=4 | |
| perplexity: 31.82 seconds per pass - ETA 7.68 minutes | |
| [1]26.7955,[2]31.6588,[3]24.8119,[4]22.6109,[5]23.1167,[6]23.3437,[7]22.9963,[8]22.8649,[9]21.0063,[10]21.1354,[11]22.2279,[12]21.7067,[13]21.2408,[14]21.4859,[15]20.6131,[16]20.8148,[17]21.3397,[18]21.2674,[19]21.5771,[20]21.3695,[21]20.9652,[22]21.0828,[23]20.9136,[24]21.0333,[25]20.5873,[26]21.1063,[27]20.6983,[28]20.7043,[29]20.3499,[30]20.5463,[31]20.8592,[32]20.7545,[33]20.8546,[34]20.6528,[35]20.8017,[36]21.1070,[37]21.2734,[38]21.3185,[39]21.4554,[40]21.4667,[41]21.6259,[42]21.6157,[43]21.2904,[44]21.5241,[45]21.3381,[46]21.4185,[47]21.5686,[48]21.5082,[49]21.4718,[50]21.8603,[51]22.0679,[52]22.2694,[53]22.4607,[54]22.6324,[55]22.6263,[56]22.2750,[57]21.8535,[58]22.1226, | |
| Final estimate: PPL = 22.1226 +/- 0.63860 | |
| llama_perf_context_print: load time = 37772.69 ms | |
| llama_perf_context_print: prompt eval time = 447483.39 ms / 29696 tokens ( 15.07 ms per token, 66.36 tokens per second) | |
| llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) | |
| llama_perf_context_print: total time = 454384.67 ms / 29697 tokens | |
| llama_perf_context_print: graphs reused = 0 |
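The three runs above report final perplexities of 90.8771 (gpt-oss-20b F16), 14.7477 (Qwen3-30B-A3B-Instruct-2507 Q8_K_XL), and 22.1226 (gemma-3-27b-it Q8_K_XL) on ~/polano.txt. For reference, a minimal shell sketch (not part of the original transcript) of how the same three invocations could be driven from one loop, assuming the same llama.cpp build tree and the GGUF files already cached under ~/.cache/llama.cpp/ as shown above:

# Sketch: run llama-perplexity over the same three cached models in sequence.
models=(
  unsloth_gpt-oss-20b-GGUF_gpt-oss-20b-F16.gguf
  unsloth_Qwen3-30B-A3B-Instruct-2507-GGUF_Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf
  unsloth_gemma-3-27b-it-GGUF_gemma-3-27b-it-UD-Q8_K_XL.gguf
)
for m in "${models[@]}"; do
  ./build/bin/llama-perplexity \
    --n-gpu-layers 100 \
    --split-mode layer \
    -m ~/.cache/llama.cpp/"$m" \
    -f ~/polano.txt
  printf '\n'   # blank line between runs, as in the original commands
done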