Created
June 25, 2025 23:09
-
-
Save MeetThePatel/a190e91dd8f726670e7568fe0b181a88 to your computer and use it in GitHub Desktop.
Reference vs. Reference+AMSGrad+GradPower(p=0.9)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import sys | |
| with open(sys.argv[0]) as f: | |
| code = f.read() # read the code of this file ASAP, for logging | |
| import uuid | |
| import time | |
| import copy | |
| import glob | |
| from dataclasses import dataclass | |
| from functools import lru_cache, partial # Added partial for hook registration | |
| from pathlib import Path | |
| os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" | |
| import torch | |
| torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems | |
| from torch import Tensor, nn | |
| import torch.nn.functional as F | |
| import torch.distributed as dist | |
| # use of FlexAttention contributed by @KoszarskyB | |
| from torch.nn.attention.flex_attention import BlockMask, flex_attention | |
| # torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min | |
| # ----------------------------------------------------------------------------- | |
| # Custom operators: FP8 matmul by @YouJiacheng | |
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    """FP8 matmul forward: out = x @ w.T via torch._scaled_mm.

    x_s / w_s are per-tensor scales used to quantize x / w to float8_e4m3fn;
    grad_s is only carried through for the backward op. Returns
    (out, x_f8, w_f8) so the quantized operands can be saved for backward.
    """
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        # quantize both operands to fp8 (e4m3) using the per-tensor scales
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,  # fast accumulation is acceptable for the forward pass
        )
        return out, x_f8, w_f8
    return impl(x, w)
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    # Fake (meta) implementation: supplies output shapes/dtypes for tracing
    # without running the fp8 kernel.
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    """Backward of nanogpt::mm: returns (grad_x, grad_w) given upstream grad g
    and the fp8-quantized operands saved by the forward pass."""
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        # scalar tensors used by _scaled_mm to dequantize the fp8 operands
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        # gradients use e5m2: wider exponent range than e4m3, at lower precision
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,  # .T.contiguous().T yields the column-major layout _scaled_mm requires
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,  # exact accumulation for gradients
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    # Fake impl: only output shapes/dtypes matter for tracing.
    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
def backward(ctx, grad_out: Tensor, *_):
    # Autograd backward for nanogpt::mm. Extra *_ grads correspond to the
    # x_f8/w_f8 auxiliary outputs and are ignored.
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(grad_out, x_f8, w_f8, x_s, w_s, grad_s)
    # None grads for the three float scale arguments (not differentiable)
    return grad_x, grad_w, None, None, None
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    # Stash the quantized operands and scales needed by backward.
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)  # don't materialize grads for unused aux outputs
mm_op.register_autograd(backward, setup_context=setup_context)
| # ----------------------------------------------------------------------------- | |
| # Muon optimizer | |
@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2  # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)  # empirically tuned quintic coefficients
    X = G.bfloat16()
    # Work in the wide orientation so A = X @ X.mT is the smaller Gram matrix.
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A  # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose applied for the tall case.
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Some warnings:
    - This optimizer should not be used for the embedding layer, the final fully connected layer,
      or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params: list[Tensor] = [*params]
        param_groups = []
        # Group params by numel so each group can share one flat bf16 all_gather
        # buffer with one row per rank.
        for size in {p.numel() for p in params}:
            b = torch.empty(world_size, size, dtype=torch.bfloat16, device="cuda")
            group = dict(params=[p for p in params if p.numel() == size], update_buffer=b, update_buffer_views=[b[i] for i in range(world_size)])
            param_groups.append(group)
        super().__init__(param_groups, defaults)
    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            update_buffer: Tensor = group["update_buffer"]
            update_buffer_views: list[Tensor] = group["update_buffer_views"]
            # generate weight updates in distributed fashion
            params: list[Tensor] = group["params"]
            handle = None  # in-flight async all_gather work handle
            params_world = None  # params whose updates are in the in-flight gather
            def update_prev():  # optimized Muon implementation contributed by @YouJiacheng
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffer_views):
                    # scale step by sqrt(max(1, rows/cols)) to keep the effective
                    # update size consistent across matrix shapes
                    p_world.add_(g_world.view_as(p_world), alpha=-group["lr"] * max(1, p_world.size(-2) / p_world.size(-1)) ** 0.5)
            # Each rank computes every world_size-th parameter's update; results
            # are exchanged with an async all_gather that overlaps with the next
            # parameter's Newton-Schulz computation.
            for base_i in range(len(params))[:: self.world_size]:
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf: Tensor = state["momentum_buffer"]
                    buf.lerp_(g, 1 - group["momentum"])  # EMA of gradients
                    g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
                    g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]).flatten()
                else:
                    # no param left for this rank: contribute a dummy row to the gather
                    g = update_buffer_views[self.rank]
                if base_i > 0:
                    update_prev()  # async all_gather instead of sync all_reduce by @YouJiacheng
                handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()
| # ----------------------------------------------------------------------------- | |
| # PyTorch nn.Module definitions for the model | |
| def norm(x: Tensor): | |
| return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    """Bias-free Linear that casts its weight to the input dtype, with an
    optional fp8 (nanogpt::mm) fast path used only during training."""
    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8  # route forward through the fp8 custom op
        # per-tensor quantization scales for input / weight / gradient
        self.x_s = x_s
        self.w_s = w_s
        self.grad_s = grad_s
    def reset_parameters(self) -> None:
        std = 0.5 * (self.in_features**-0.5)  # 0.5 is a bit better than the default 1/sqrt(3)
        bound = (3**0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)
    def forward(self, x: Tensor):
        if self.use_fp8 and self.training:
            _x = x.flatten(0, -2)  # collapse leading dims: the fp8 op wants 2D operands
            out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
            return out.reshape(*x.shape[:-1], -1)
        else:
            return F.linear(x, self.weight.type_as(x))
class Rotary(nn.Module):
    """Rotary position embeddings where half the frequency bands are zeroed
    ("half-truncate" RoPE), so only part of each head dim is rotated."""
    def __init__(self, dim: int, max_seq_len: int):
        super().__init__()
        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim // 4, dtype=torch.float32)
        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim // 4)])  # zero freq => identity rotation on that half
        t = torch.arange(max_seq_len, dtype=torch.float32)
        theta = torch.einsum("i,j -> ij", t, angular_freq)  # (seq, dim//2) rotation angles
        self.cos = nn.Buffer(theta.cos(), persistent=False)
        self.sin = nn.Buffer(theta.sin(), persistent=False)
    def forward(self, x_BTHD: Tensor):
        # x_BTHD layout per the name: (batch, seq, heads, head_dim); the rotation
        # angle varies along dim -3 (the sequence axis).
        assert self.cos.size(0) >= x_BTHD.size(-3)
        cos, sin = self.cos[None, : x_BTHD.size(-3), None, :], self.sin[None, : x_BTHD.size(-3), None, :]
        x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
        # standard 2D rotation applied to the (x1, x2) pairs, in fp32 for accuracy
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x_BTHD)
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with merged QKV weights, QK-norm,
    rotary embeddings, and a learnable blend of V with token value embeddings."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim**-0.5)
        bound = (3**0.5) * std  # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))  # mixing weights: v vs. value embedding
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_()  # zero init suggested by @Grad62304977
    def forward(self, x: Tensor, ve: Tensor | None, block_mask: BlockMask):
        B, T = x.size(0), x.size(1)  # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # single matmul produces Q, K, V stacked along the head axis, then split
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
        q, k = norm(q), norm(k)  # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        if ve is not None:
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v)  # @KoszarskyB & @Grad62304977
        else:  # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, scale=0.12).transpose(1, 2)
        y = y.contiguous().view(B, T, self.num_heads * self.head_dim)  # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y
class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""
    def __init__(self, dim: int):
        super().__init__()
        hidden = dim * 4
        self.c_fc = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        # zero init suggested by @Grad62304977: block contributes nothing at init
        self.c_proj.weight.detach().zero_()
    def forward(self, x: Tensor):
        # squared ReLU: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU
        # (suggested by @SKYLINEZ007 and @Grad62304977)
        hidden = self.c_fc(x)
        activated = F.relu(hidden).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """One transformer layer: optional attention then MLP, each behind a
    pre-norm residual, plus a learnable blend of the stream with x0."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = None if layer_idx == 7 else CausalSelfAttention(dim, num_heads, max_seq_len)
        self.mlp = MLP(dim)
        self.lambdas = nn.Parameter(torch.tensor([1.0, 0.0]))
    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, block_mask: BlockMask):
        # learnable mix of the residual stream with the embedding stream x0
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        attn = self.attn
        if attn is not None:
            x = x + attn(norm(x), ve, block_mask)
        return x + self.mlp(norm(x))
| # ----------------------------------------------------------------------------- | |
| # The main model | |
| def next_multiple_of_n(v: float | int, *, n: int): | |
| return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) | |
class GPT(nn.Module):
    """GPT-style decoder with U-net skip connections, token value embeddings,
    long/short sliding-window FlexAttention masks, and an fp8 LM head."""
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(
            model_dim, next_multiple_of_n(vocab_size, n=128), use_fp8=True, x_s=(model_dim**0.5) / 448, w_s=24 / 448, grad_s=1 / 448
        )
        self.lm_head.weight.detach().zero_()  # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        self.skip_weights = nn.Parameter(torch.ones(num_layers // 2))
    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Build (long, short) FlexAttention BlockMasks: causal, document-bounded,
        with a sliding window measured in 128-token blocks (short = long // 2)."""
        BLOCK_SIZE = 128
        docs = (input_seq == 50256).cumsum(0)  # document id per token (50256 = GPT-2 <|endoftext|>)
        def document_causal(b, h, q_idx, kv_idx):
            # attend only to earlier tokens within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_blockmask: Tensor):
            # convert a dense boolean (q_block, kv_block) mask to the
            # (counts, ordered indices) layout that BlockMask expects
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        # "any"/"all": whether any / every (q, kv) pair inside the block pair is visible
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        # partial blocks need the mask_mod applied per-element; full blocks don't
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)
        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapated from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)
    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Return cross-entropy loss (sum-reduced in training, mean in eval)."""
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        # hand-picked per-layer long/short mask assignment (assumes 12 layers)
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None])  # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        n = len(self.skip_weights)
        for i in range(len(self.blocks)):
            if i >= n:
                # decoder half: add the matching encoder output, scaled by a learnable weight
                x = x + self.skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1) ** 0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction="sum" if self.training else "mean")
        return loss
| # ----------------------------------------------------------------------------- | |
| # Our own simple Distributed Data Loader | |
def _load_data_shard(file: Path):
    """Load one tokenized .bin shard into a pinned uint16 CPU tensor.

    Shard layout: 256 int32 header words (magic, version, token count),
    followed by the uint16 token stream.
    """
    header = torch.from_file(str(file), False, 256, dtype=torch.int32)  # header is 256 int32
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2])  # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)  # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4)  # skip past the header bytes
        nbytes = f.readinto(tokens.numpy())  # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
def distributed_data_generator(filename_pattern: str, batch_size: int, rank: int, world_size: int):
    """Yield (inputs, targets) cuda tensors forever (until shards run out);
    each rank reads its own contiguous batch_size/world_size slice."""
    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files)  # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        if pos + batch_size + 1 >= len(tokens):
            # shard exhausted: move to the next file (StopIteration ends the generator)
            tokens, pos = _load_data_shard(next(file_iter)), 0
        # +1 token so targets can be inputs shifted left by one
        buf = tokens[pos + rank * local_batch_size :][: local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True)  # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True)  # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets
| # ----------------------------------------------------------------------------- | |
| # int main | |
@dataclass
class Hyperparameters:
    """Run configuration.

    NOTE(review): these attributes carry no type annotations, so @dataclass
    generates no fields — the class acts as a plain constant namespace and
    Hyperparameters() takes no arguments.
    """
    # data
    train_files = "data/fineweb10B/fineweb_train_*.bin"  # input .bin to train on
    val_files = "data/fineweb10B/fineweb_val_*.bin"  # input .bin to eval validation loss on
    val_tokens = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_seq_len = 48 * 1024  # FlexAttention sequence length
    val_seq_len = 4 * 64 * 1024  # FlexAttention sequence length for validation
    # optimization
    num_iterations = 1770  # number of iterations to run
    cooldown_frac = 0.4  # fraction of training spent cooling down the learning rate
    # architecture
    vocab_size = 50257  # GPT-2 vocab; lm_head pads this to a multiple of 128
    # evaluation and logging
    val_loss_every = 125  # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint = False  # save model+optimizer state at the final step
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
# NOTE(review): comment says 8xH100 but the assert requires world_size == 1 —
# this gist appears to be a single-GPU variant; confirm the intended world size.
assert world_size == 1  # this code is designed for 8xH100
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = rank == 0  # this process will do logging, checkpointing etc.
# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=False):
    # Append to the run's logfile on rank 0 only; echo to stdout when console=True.
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)
# begin by printing this file (the Python code)
print0(code)
print0("=" * 100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    import subprocess  # avoid top level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
print0(nvidia_smi())
print0("=" * 100)
########################################
#    Construct model and optimizer     #
########################################
model: nn.Module = GPT(
    vocab_size=args.vocab_size, num_layers=12, num_heads=6, model_dim=768, max_seq_len=max(args.train_seq_len, args.val_seq_len)
).cuda()
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()  # embeddings live in bf16; other params keep their dtype
for param in model.parameters():
    dist.broadcast(param.detach(), 0)  # sync initial weights from rank 0
# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s): Adam for head/embeds/scalars, Muon for hidden matrices
adam_params = [dict(params=head_params, lr=0.22), dict(params=embed_params, lr=0.65), dict(params=scalar_params, lr=0.04)]
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), amsgrad=True, eps=1e-10, fused=True)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
optimizers = [optimizer1, optimizer2]
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]  # remember base lr so the schedule can scale it per step
# learning rate schedule: stable then decay
def get_lr(step: int):
    """LR multiplier: 1.0 for the first (1 - cooldown_frac) of training, then
    a linear ramp from 1.0 down to 0.1 over the final cooldown_frac."""
    progress = step / args.num_iterations
    assert 0 <= progress < 1
    cooldown_start = 1 - args.cooldown_frac
    if progress < cooldown_start:
        return 1.0
    weight = (1 - progress) / args.cooldown_frac
    return weight * 1.0 + (1 - weight) * 0.1
# attention window size schedule: linearly increase
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    # cache of 1: consecutive steps usually share a window size, so this avoids
    # re-creating and re-transferring the scalar tensor every step
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    """Return the sliding-window size (in 128-token blocks) as a cuda int32 scalar."""
    x = step / args.num_iterations  # progress in training
    assert 0 <= x <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * x, n=128)
    return get_window_size_blocks_helper(window_size)
model: nn.Module = torch.compile(model, dynamic=False)
########################################
#            Warmup kernels            #
########################################
# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
initial_state = dict(
    model=copy.deepcopy(model.state_dict()), optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]
)  # save the initial state
for _ in range(warmup_steps):
    # random tokens suffice: we only need to trigger compilation of all kernels
    inputs = targets = torch.randint(0, args.vocab_size, size=(args.train_seq_len,), device="cuda")
    model(inputs.to(torch.int32), targets, get_window_size_blocks(0)).backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
# restore pre-warmup weights and optimizer state
model.load_state_dict(initial_state["model"])
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del initial_state
########################################
#      Overlap Communication Setup     #
########################################
# Create parameter buckets for better overlap
def create_buckets(params, bucket_size_mb=25):
    """Group parameters into buckets of approximately bucket_size_mb MB each."""
    # Largest-first ordering packs buckets more evenly.
    ordered = sorted(params, key=lambda p: p.numel(), reverse=True)
    buckets, bucket, bucket_mb = [], [], 0.0
    for p in ordered:
        size_mb = p.numel() * p.element_size() / (1024 * 1024)
        if bucket and bucket_mb + size_mb > bucket_size_mb:
            # current bucket would overflow: seal it and start a new one
            buckets.append(bucket)
            bucket, bucket_mb = [p], size_mb
        else:
            bucket.append(p)
            bucket_mb += size_mb
    if bucket:
        buckets.append(bucket)
    return buckets
# Create buckets for all parameters
all_params = [p for p in model.parameters() if p.requires_grad]
param_buckets = create_buckets(all_params)
print0(f"Created {len(param_buckets)} gradient buckets")
for i, bucket in enumerate(param_buckets):
    total_size = sum(p.numel() * p.element_size() for p in bucket) / (1024 * 1024)
    print0(f"Bucket {i}: {len(bucket)} params, {total_size:.1f} MB")
# Bucket state tracking
bucket_ready_count = [0] * len(param_buckets)  # grads accumulated so far, per bucket
bucket_handles = [None] * len(param_buckets)  # in-flight async all-reduce handle, per bucket
param_to_bucket = {}
# Map each parameter to its bucket index
for bucket_idx, bucket in enumerate(param_buckets):
    for param in bucket:
        param_to_bucket[param] = bucket_idx
def _gradient_hook(param: Tensor):
    """Called when a parameter's gradient is ready.

    Fires during backward (post-accumulate hook), so the bucket all-reduce
    overlaps with the remaining backward computation.
    """
    if param.grad is None:
        return
    bucket_idx = param_to_bucket[param]
    bucket_ready_count[bucket_idx] += 1
    # Check if all parameters in this bucket are ready
    if bucket_ready_count[bucket_idx] == len(param_buckets[bucket_idx]):
        # All-reduce this bucket
        bucket_grads = [p.grad for p in param_buckets[bucket_idx]]
        # For multi-tensor operations, we can reduce them together
        if len(bucket_grads) == 1:
            handle = dist.all_reduce(bucket_grads[0], op=dist.ReduceOp.AVG, async_op=True)
        else:
            # Use multi-tensor all-reduce for efficiency
            handle = dist.all_reduce_coalesced(bucket_grads, op=dist.ReduceOp.AVG, async_op=True)
        bucket_handles[bucket_idx] = handle
# Register hooks for all parameters
print0("Registering bucketed gradient hooks...")
for param in all_params:
    # fires once per param after its grad is fully accumulated during backward
    param.register_post_accumulate_grad_hook(_gradient_hook)
def wait_for_gradients():
    """Wait for all gradient reductions to complete and reset bucket state."""
    for handle in bucket_handles:
        if handle is not None:
            handle.wait()
    # Reset state for next iteration
    for i in range(len(bucket_ready_count)):
        bucket_ready_count[i] = 0
        bucket_handles[i] = None
########################################
#        Training and validation       #
########################################
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, rank, world_size)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = step == train_steps
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock: validation time is excluded from the reported train time
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_batch_size = world_size * args.val_seq_len
        assert args.val_tokens % val_batch_size == 0  # keep val token count exact
        val_steps = args.val_tokens // val_batch_size
        val_loader = distributed_data_generator(args.val_files, val_batch_size, rank, world_size)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets = next(val_loader)
                val_loss += model(inputs, targets, get_window_size_blocks(step))
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(
            f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms",
            console=True,
        )
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break
    # --------------- TRAINING SECTION -----------------
    inputs, targets = next(train_loader)
    model(inputs, targets, get_window_size_blocks(step)).backward()
    # for param in model.parameters():
    #     dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    wait_for_gradients()  # does the same thing as commented two lines above, but faster
    # set optimization hyperparameters
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)
    for group in optimizer2.param_groups:
        frac = min(step / 300, 1)  # momentum warmup for muon
        group["momentum"] = (1 - frac) * 0.85 + frac * 0.95
    # GradPower (p=0.9): elementwise sign(g) * |g|^0.9 applied to Muon's grads
    # before the step. NOTE(review): writes through .grad.data; assigning to
    # param.grad directly would be the cleaner modern idiom.
    with torch.no_grad():
        for group in optimizer2.param_groups:
            for param in group["params"]:
                if param.grad is not None:
                    param.grad.data = torch.sign(param.grad) * torch.abs(param.grad).pow(0.9)
    # step the optimizers
    for opt in optimizers:
        opt.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # logging
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(
        f"step:{step + 1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / (step + 1):.2f}ms",
        console=True,
    )
print0(
    f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
    f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB",
    console=True,
)
dist.destroy_process_group()
| ==================================================================================================== | |
| Running Python 3.12.3 (main, Feb 4 2025, 14:48:35) [GCC 13.3.0] | |
| Running PyTorch 2.7.1+cu126 compiled for CUDA 12.6 | |
| Wed Jun 25 22:40:03 2025 | |
| +-----------------------------------------------------------------------------------------+ | |
| | NVIDIA-SMI 560.35.03 Driver Version: 560.35.03 CUDA Version: 12.6 | | |
| |-----------------------------------------+------------------------+----------------------+ | |
| | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | | MIG M. | | |
| |=========================================+========================+======================| | |
| | 0 NVIDIA H100 80GB HBM3 On | 00000000:C6:00.0 Off | 0 | | |
| | N/A 46C P0 153W / 700W | 1184MiB / 81559MiB | 3% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| +-----------------------------------------------------------------------------------------+ | |
| | Processes: | | |
| | GPU GI CI PID Type Process name GPU Memory | | |
| | ID ID Usage | | |
| |=========================================================================================| | |
| +-----------------------------------------------------------------------------------------+ | |
| ==================================================================================================== | |
| Created 22 gradient buckets | |
| Bucket 0: 1 params, 147.4 MB | |
| Bucket 1: 1 params, 73.6 MB | |
| Bucket 2: 1 params, 73.6 MB | |
| Bucket 3: 1 params, 73.6 MB | |
| Bucket 4: 1 params, 73.6 MB | |
| Bucket 5: 2 params, 18.0 MB | |
| Bucket 6: 2 params, 18.0 MB | |
| Bucket 7: 2 params, 18.0 MB | |
| Bucket 8: 2 params, 18.0 MB | |
| Bucket 9: 2 params, 18.0 MB | |
| Bucket 10: 2 params, 18.0 MB | |
| Bucket 11: 2 params, 18.0 MB | |
| Bucket 12: 2 params, 18.0 MB | |
| Bucket 13: 2 params, 18.0 MB | |
| Bucket 14: 2 params, 18.0 MB | |
| Bucket 15: 2 params, 18.0 MB | |
| Bucket 16: 3 params, 24.8 MB | |
| Bucket 17: 3 params, 20.2 MB | |
| Bucket 18: 3 params, 20.2 MB | |
| Bucket 19: 3 params, 20.2 MB | |
| Bucket 20: 9 params, 24.8 MB | |
| Bucket 21: 27 params, 6.8 MB | |
| Registering bucketed gradient hooks... | |
| step:0/1770 val_loss:10.8258 train_time:0ms step_avg:0.02ms | |
| step:1/1770 train_time:165ms step_avg:164.94ms | |
| step:2/1770 train_time:263ms step_avg:131.31ms | |
| step:3/1770 train_time:365ms step_avg:121.65ms | |
| step:4/1770 train_time:468ms step_avg:117.04ms | |
| step:5/1770 train_time:572ms step_avg:114.35ms | |
| step:6/1770 train_time:675ms step_avg:112.56ms | |
| step:7/1770 train_time:779ms step_avg:111.26ms | |
| step:8/1770 train_time:882ms step_avg:110.22ms | |
| step:9/1770 train_time:985ms step_avg:109.48ms | |
| step:10/1770 train_time:1088ms step_avg:108.77ms | |
| step:11/1770 train_time:1191ms step_avg:108.31ms | |
| step:12/1770 train_time:1295ms step_avg:107.92ms | |
| step:13/1770 train_time:1399ms step_avg:107.59ms | |
| step:14/1770 train_time:1503ms step_avg:107.39ms | |
| step:15/1770 train_time:1607ms step_avg:107.12ms | |
| step:16/1770 train_time:1710ms step_avg:106.87ms | |
| step:17/1770 train_time:1813ms step_avg:106.65ms | |
| step:18/1770 train_time:1917ms step_avg:106.51ms | |
| step:19/1770 train_time:2024ms step_avg:106.52ms | |
| step:20/1770 train_time:2126ms step_avg:106.29ms | |
| step:21/1770 train_time:2229ms step_avg:106.16ms | |
| step:22/1770 train_time:2333ms step_avg:106.03ms | |
| step:23/1770 train_time:2436ms step_avg:105.93ms | |
| step:24/1770 train_time:2540ms step_avg:105.83ms | |
| step:25/1770 train_time:2643ms step_avg:105.71ms | |
| step:26/1770 train_time:2746ms step_avg:105.62ms | |
| step:27/1770 train_time:2850ms step_avg:105.55ms | |
| step:28/1770 train_time:2953ms step_avg:105.47ms | |
| step:29/1770 train_time:3057ms step_avg:105.41ms | |
| step:30/1770 train_time:3160ms step_avg:105.35ms | |
| step:31/1770 train_time:3264ms step_avg:105.30ms | |
| step:32/1770 train_time:3367ms step_avg:105.23ms | |
| step:33/1770 train_time:3471ms step_avg:105.17ms | |
| step:34/1770 train_time:3574ms step_avg:105.11ms | |
| step:35/1770 train_time:3677ms step_avg:105.06ms | |
| step:36/1770 train_time:3781ms step_avg:105.03ms | |
| step:37/1770 train_time:3884ms step_avg:104.96ms | |
| step:38/1770 train_time:3987ms step_avg:104.91ms | |
| step:39/1770 train_time:4090ms step_avg:104.88ms | |
| step:40/1770 train_time:4194ms step_avg:104.85ms | |
| step:41/1770 train_time:4298ms step_avg:104.83ms | |
| step:42/1770 train_time:4401ms step_avg:104.79ms | |
| step:43/1770 train_time:4505ms step_avg:104.77ms | |
| step:44/1770 train_time:4608ms step_avg:104.74ms | |
| step:45/1770 train_time:4712ms step_avg:104.72ms | |
| step:46/1770 train_time:4816ms step_avg:104.71ms | |
| step:47/1770 train_time:4920ms step_avg:104.68ms | |
| step:48/1770 train_time:5024ms step_avg:104.66ms | |
| step:49/1770 train_time:5127ms step_avg:104.63ms | |
| step:50/1770 train_time:5230ms step_avg:104.60ms | |
| step:51/1770 train_time:5334ms step_avg:104.59ms | |
| step:52/1770 train_time:5438ms step_avg:104.57ms | |
| step:53/1770 train_time:5541ms step_avg:104.55ms | |
| step:54/1770 train_time:5645ms step_avg:104.54ms | |
| step:55/1770 train_time:5748ms step_avg:104.52ms | |
| step:56/1770 train_time:5852ms step_avg:104.50ms | |
| step:57/1770 train_time:5956ms step_avg:104.48ms | |
| step:58/1770 train_time:6059ms step_avg:104.47ms | |
| step:59/1770 train_time:6162ms step_avg:104.45ms | |
| step:60/1770 train_time:6266ms step_avg:104.43ms | |
| step:61/1770 train_time:6369ms step_avg:104.40ms | |
| step:62/1770 train_time:6472ms step_avg:104.38ms | |
| step:63/1770 train_time:6575ms step_avg:104.36ms | |
| step:64/1770 train_time:6679ms step_avg:104.35ms | |
| step:65/1770 train_time:6781ms step_avg:104.33ms | |
| step:66/1770 train_time:6884ms step_avg:104.30ms | |
| step:67/1770 train_time:6987ms step_avg:104.28ms | |
| step:68/1770 train_time:7090ms step_avg:104.26ms | |
| step:69/1770 train_time:7194ms step_avg:104.26ms | |
| step:70/1770 train_time:7296ms step_avg:104.23ms | |
| step:71/1770 train_time:7399ms step_avg:104.21ms | |
| step:72/1770 train_time:7503ms step_avg:104.20ms | |
| step:73/1770 train_time:7606ms step_avg:104.19ms | |
| step:74/1770 train_time:7709ms step_avg:104.17ms | |
| step:75/1770 train_time:7812ms step_avg:104.16ms | |
| step:76/1770 train_time:7916ms step_avg:104.16ms | |
| step:77/1770 train_time:8019ms step_avg:104.15ms | |
| step:78/1770 train_time:8123ms step_avg:104.14ms | |
| step:79/1770 train_time:8227ms step_avg:104.13ms | |
| step:80/1770 train_time:8331ms step_avg:104.13ms | |
| step:81/1770 train_time:8434ms step_avg:104.13ms | |
| step:82/1770 train_time:8538ms step_avg:104.12ms | |
| step:83/1770 train_time:8644ms step_avg:104.15ms | |
| step:84/1770 train_time:8746ms step_avg:104.12ms | |
| step:85/1770 train_time:8849ms step_avg:104.10ms | |
| step:86/1770 train_time:8952ms step_avg:104.10ms | |
| step:87/1770 train_time:9055ms step_avg:104.09ms | |
| step:88/1770 train_time:9159ms step_avg:104.08ms | |
| step:89/1770 train_time:9262ms step_avg:104.07ms | |
| step:90/1770 train_time:9365ms step_avg:104.06ms | |
| step:91/1770 train_time:9469ms step_avg:104.06ms | |
| step:92/1770 train_time:9572ms step_avg:104.05ms | |
| step:93/1770 train_time:9678ms step_avg:104.06ms | |
| step:94/1770 train_time:9781ms step_avg:104.05ms | |
| step:95/1770 train_time:9882ms step_avg:104.02ms | |
| step:96/1770 train_time:9985ms step_avg:104.01ms | |
| step:97/1770 train_time:10089ms step_avg:104.01ms | |
| step:98/1770 train_time:10193ms step_avg:104.01ms | |
| step:99/1770 train_time:10296ms step_avg:104.00ms | |
| step:100/1770 train_time:10398ms step_avg:103.98ms | |
| step:101/1770 train_time:10500ms step_avg:103.96ms | |
| step:102/1770 train_time:10603ms step_avg:103.96ms | |
| step:103/1770 train_time:10708ms step_avg:103.96ms | |
| step:104/1770 train_time:10810ms step_avg:103.94ms | |
| step:105/1770 train_time:10914ms step_avg:103.94ms | |
| step:106/1770 train_time:11018ms step_avg:103.95ms | |
| step:107/1770 train_time:11122ms step_avg:103.95ms | |
| step:108/1770 train_time:11226ms step_avg:103.94ms | |
| step:109/1770 train_time:11329ms step_avg:103.94ms | |
| step:110/1770 train_time:11434ms step_avg:103.94ms | |
| step:111/1770 train_time:11537ms step_avg:103.94ms | |
| step:112/1770 train_time:11640ms step_avg:103.93ms | |
| step:113/1770 train_time:11744ms step_avg:103.93ms | |
| step:114/1770 train_time:11847ms step_avg:103.92ms | |
| step:115/1770 train_time:11950ms step_avg:103.91ms | |
| step:116/1770 train_time:12054ms step_avg:103.91ms | |
| step:117/1770 train_time:12157ms step_avg:103.90ms | |
| step:118/1770 train_time:12259ms step_avg:103.89ms | |
| step:119/1770 train_time:12362ms step_avg:103.88ms | |
| step:120/1770 train_time:12465ms step_avg:103.88ms | |
| step:121/1770 train_time:12569ms step_avg:103.87ms | |
| step:122/1770 train_time:12671ms step_avg:103.86ms | |
| step:123/1770 train_time:12774ms step_avg:103.85ms | |
| step:124/1770 train_time:12878ms step_avg:103.86ms | |
| step:125/1770 train_time:12980ms step_avg:103.84ms | |
| step:125/1770 val_loss:5.5243 train_time:12984ms step_avg:103.87ms | |
| step:126/1770 train_time:13088ms step_avg:103.87ms | |
| step:127/1770 train_time:13191ms step_avg:103.87ms | |
| step:128/1770 train_time:13295ms step_avg:103.87ms | |
| step:129/1770 train_time:13399ms step_avg:103.86ms | |
| step:130/1770 train_time:13502ms step_avg:103.86ms | |
| step:131/1770 train_time:13605ms step_avg:103.86ms | |
| step:132/1770 train_time:13709ms step_avg:103.86ms | |
| step:133/1770 train_time:13813ms step_avg:103.86ms | |
| step:134/1770 train_time:13922ms step_avg:103.89ms | |
| step:135/1770 train_time:14022ms step_avg:103.87ms | |
| step:136/1770 train_time:14126ms step_avg:103.86ms | |
| step:137/1770 train_time:14229ms step_avg:103.86ms | |
| step:138/1770 train_time:14333ms step_avg:103.86ms | |
| step:139/1770 train_time:14437ms step_avg:103.86ms | |
| step:140/1770 train_time:14541ms step_avg:103.86ms | |
| step:141/1770 train_time:14644ms step_avg:103.86ms | |
| step:142/1770 train_time:14749ms step_avg:103.86ms | |
| step:143/1770 train_time:14852ms step_avg:103.86ms | |
| step:144/1770 train_time:14956ms step_avg:103.86ms | |
| step:145/1770 train_time:15060ms step_avg:103.86ms | |
| step:146/1770 train_time:15163ms step_avg:103.86ms | |
| step:147/1770 train_time:15267ms step_avg:103.86ms | |
| step:148/1770 train_time:15371ms step_avg:103.86ms | |
| step:149/1770 train_time:15478ms step_avg:103.88ms | |
| step:150/1770 train_time:15579ms step_avg:103.86ms | |
| step:151/1770 train_time:15683ms step_avg:103.86ms | |
| step:152/1770 train_time:15788ms step_avg:103.87ms | |
| step:153/1770 train_time:15892ms step_avg:103.87ms | |
| step:154/1770 train_time:15996ms step_avg:103.87ms | |
| step:155/1770 train_time:16100ms step_avg:103.87ms | |
| step:156/1770 train_time:16204ms step_avg:103.87ms | |
| step:157/1770 train_time:16307ms step_avg:103.87ms | |
| step:158/1770 train_time:16411ms step_avg:103.87ms | |
| step:159/1770 train_time:16514ms step_avg:103.86ms | |
| step:160/1770 train_time:16617ms step_avg:103.86ms | |
| step:161/1770 train_time:16721ms step_avg:103.86ms | |
| step:162/1770 train_time:16824ms step_avg:103.85ms | |
| step:163/1770 train_time:16928ms step_avg:103.85ms | |
| step:164/1770 train_time:17031ms step_avg:103.85ms | |
| step:165/1770 train_time:17135ms step_avg:103.85ms | |
| step:166/1770 train_time:17240ms step_avg:103.85ms | |
| step:167/1770 train_time:17343ms step_avg:103.85ms | |
| step:168/1770 train_time:17447ms step_avg:103.85ms | |
| step:169/1770 train_time:17551ms step_avg:103.85ms | |
| step:170/1770 train_time:17654ms step_avg:103.85ms | |
| step:171/1770 train_time:17758ms step_avg:103.85ms | |
| step:172/1770 train_time:17862ms step_avg:103.85ms | |
| step:173/1770 train_time:17966ms step_avg:103.85ms | |
| step:174/1770 train_time:18070ms step_avg:103.85ms | |
| step:175/1770 train_time:18174ms step_avg:103.85ms | |
| step:176/1770 train_time:18280ms step_avg:103.86ms | |
| step:177/1770 train_time:18382ms step_avg:103.86ms | |
| step:178/1770 train_time:18487ms step_avg:103.86ms | |
| step:179/1770 train_time:18589ms step_avg:103.85ms | |
| step:180/1770 train_time:18692ms step_avg:103.85ms | |
| step:181/1770 train_time:18796ms step_avg:103.85ms | |
| step:182/1770 train_time:18900ms step_avg:103.85ms | |
| step:183/1770 train_time:19003ms step_avg:103.84ms | |
| step:184/1770 train_time:19107ms step_avg:103.84ms | |
| step:185/1770 train_time:19210ms step_avg:103.84ms | |
| step:186/1770 train_time:19313ms step_avg:103.84ms | |
| step:187/1770 train_time:19417ms step_avg:103.83ms | |
| step:188/1770 train_time:19521ms step_avg:103.83ms | |
| step:189/1770 train_time:19624ms step_avg:103.83ms | |
| step:190/1770 train_time:19727ms step_avg:103.83ms | |
| step:191/1770 train_time:19831ms step_avg:103.83ms | |
| step:192/1770 train_time:19934ms step_avg:103.82ms | |
| step:193/1770 train_time:20038ms step_avg:103.82ms | |
| step:194/1770 train_time:20142ms step_avg:103.82ms | |
| step:195/1770 train_time:20245ms step_avg:103.82ms | |
| step:196/1770 train_time:20349ms step_avg:103.82ms | |
| step:197/1770 train_time:20452ms step_avg:103.82ms | |
| step:198/1770 train_time:20556ms step_avg:103.82ms | |
| step:199/1770 train_time:20660ms step_avg:103.82ms | |
| step:200/1770 train_time:20763ms step_avg:103.82ms | |
| step:201/1770 train_time:20867ms step_avg:103.81ms | |
| step:202/1770 train_time:20970ms step_avg:103.81ms | |
| step:203/1770 train_time:21074ms step_avg:103.81ms | |
| step:204/1770 train_time:21183ms step_avg:103.84ms | |
| step:205/1770 train_time:21282ms step_avg:103.82ms | |
| step:206/1770 train_time:21387ms step_avg:103.82ms | |
| step:207/1770 train_time:21491ms step_avg:103.82ms | |
| step:208/1770 train_time:21594ms step_avg:103.82ms | |
| step:209/1770 train_time:21698ms step_avg:103.82ms | |
| step:210/1770 train_time:21802ms step_avg:103.82ms | |
| step:211/1770 train_time:21905ms step_avg:103.81ms | |
| step:212/1770 train_time:22008ms step_avg:103.81ms | |
| step:213/1770 train_time:22112ms step_avg:103.81ms | |
| step:214/1770 train_time:22215ms step_avg:103.81ms | |
| step:215/1770 train_time:22320ms step_avg:103.81ms | |
| step:216/1770 train_time:22423ms step_avg:103.81ms | |
| step:217/1770 train_time:22526ms step_avg:103.81ms | |
| step:218/1770 train_time:22628ms step_avg:103.80ms | |
| step:219/1770 train_time:22732ms step_avg:103.80ms | |
| step:220/1770 train_time:22836ms step_avg:103.80ms | |
| step:221/1770 train_time:22940ms step_avg:103.80ms | |
| step:222/1770 train_time:23044ms step_avg:103.80ms | |
| step:223/1770 train_time:23146ms step_avg:103.80ms | |
| step:224/1770 train_time:23250ms step_avg:103.80ms | |
| step:225/1770 train_time:23354ms step_avg:103.79ms | |
| step:226/1770 train_time:23457ms step_avg:103.79ms | |
| step:227/1770 train_time:23562ms step_avg:103.80ms | |
| step:228/1770 train_time:23666ms step_avg:103.80ms | |
| step:229/1770 train_time:23769ms step_avg:103.79ms | |
| step:230/1770 train_time:23877ms step_avg:103.81ms | |
| step:231/1770 train_time:23977ms step_avg:103.80ms | |
| step:232/1770 train_time:24081ms step_avg:103.80ms | |
| step:233/1770 train_time:24184ms step_avg:103.79ms | |
| step:234/1770 train_time:24288ms step_avg:103.79ms | |
| step:235/1770 train_time:24391ms step_avg:103.79ms | |
| step:236/1770 train_time:24496ms step_avg:103.80ms | |
| step:237/1770 train_time:24598ms step_avg:103.79ms | |
| step:238/1770 train_time:24702ms step_avg:103.79ms | |
| step:239/1770 train_time:24805ms step_avg:103.79ms | |
| step:240/1770 train_time:24908ms step_avg:103.78ms | |
| step:241/1770 train_time:25012ms step_avg:103.78ms | |
| step:242/1770 train_time:25115ms step_avg:103.78ms | |
| step:243/1770 train_time:25219ms step_avg:103.78ms | |
| step:244/1770 train_time:25322ms step_avg:103.78ms | |
| step:245/1770 train_time:25426ms step_avg:103.78ms | |
| step:246/1770 train_time:25529ms step_avg:103.78ms | |
| step:247/1770 train_time:25632ms step_avg:103.77ms | |
| step:248/1770 train_time:25736ms step_avg:103.77ms | |
| step:249/1770 train_time:25839ms step_avg:103.77ms | |
| step:250/1770 train_time:25942ms step_avg:103.77ms | |
| step:250/1770 val_loss:4.9427 train_time:25946ms step_avg:103.79ms | |
| step:251/1770 train_time:26052ms step_avg:103.79ms | |
| step:252/1770 train_time:26155ms step_avg:103.79ms | |
| step:253/1770 train_time:26259ms step_avg:103.79ms | |
| step:254/1770 train_time:26362ms step_avg:103.79ms | |
| step:255/1770 train_time:26466ms step_avg:103.79ms | |
| step:256/1770 train_time:26569ms step_avg:103.79ms | |
| step:257/1770 train_time:26673ms step_avg:103.79ms | |
| step:258/1770 train_time:26776ms step_avg:103.78ms | |
| step:259/1770 train_time:26880ms step_avg:103.78ms | |
| step:260/1770 train_time:26983ms step_avg:103.78ms | |
| step:261/1770 train_time:27087ms step_avg:103.78ms | |
| step:262/1770 train_time:27191ms step_avg:103.78ms | |
| step:263/1770 train_time:27295ms step_avg:103.78ms | |
| step:264/1770 train_time:27399ms step_avg:103.79ms | |
| step:265/1770 train_time:27504ms step_avg:103.79ms | |
| step:266/1770 train_time:27607ms step_avg:103.79ms | |
| step:267/1770 train_time:27712ms step_avg:103.79ms | |
| step:268/1770 train_time:27817ms step_avg:103.80ms | |
| step:269/1770 train_time:27920ms step_avg:103.79ms | |
| step:270/1770 train_time:28028ms step_avg:103.81ms | |
| step:271/1770 train_time:28131ms step_avg:103.81ms | |
| step:272/1770 train_time:28233ms step_avg:103.80ms | |
| step:273/1770 train_time:28337ms step_avg:103.80ms | |
| step:274/1770 train_time:28441ms step_avg:103.80ms | |
| step:275/1770 train_time:28545ms step_avg:103.80ms | |
| step:276/1770 train_time:28649ms step_avg:103.80ms | |
| step:277/1770 train_time:28753ms step_avg:103.80ms | |
| step:278/1770 train_time:28856ms step_avg:103.80ms | |
| step:279/1770 train_time:28961ms step_avg:103.80ms | |
| step:280/1770 train_time:29065ms step_avg:103.80ms | |
| step:281/1770 train_time:29169ms step_avg:103.80ms | |
| step:282/1770 train_time:29272ms step_avg:103.80ms | |
| step:283/1770 train_time:29376ms step_avg:103.80ms | |
| step:284/1770 train_time:29480ms step_avg:103.80ms | |
| step:285/1770 train_time:29583ms step_avg:103.80ms | |
| step:286/1770 train_time:29688ms step_avg:103.80ms | |
| step:287/1770 train_time:29793ms step_avg:103.81ms | |
| step:288/1770 train_time:29898ms step_avg:103.81ms | |
| step:289/1770 train_time:30002ms step_avg:103.81ms | |
| step:290/1770 train_time:30106ms step_avg:103.81ms | |
| step:291/1770 train_time:30210ms step_avg:103.82ms | |
| step:292/1770 train_time:30315ms step_avg:103.82ms | |
| step:293/1770 train_time:30419ms step_avg:103.82ms | |
| step:294/1770 train_time:30523ms step_avg:103.82ms | |
| step:295/1770 train_time:30627ms step_avg:103.82ms | |
| step:296/1770 train_time:30733ms step_avg:103.83ms | |
| step:297/1770 train_time:30835ms step_avg:103.82ms | |
| step:298/1770 train_time:30939ms step_avg:103.82ms | |
| step:299/1770 train_time:31043ms step_avg:103.82ms | |
| step:300/1770 train_time:31147ms step_avg:103.82ms | |
| step:301/1770 train_time:31252ms step_avg:103.83ms | |
| step:302/1770 train_time:31356ms step_avg:103.83ms | |
| step:303/1770 train_time:31460ms step_avg:103.83ms | |
| step:304/1770 train_time:31564ms step_avg:103.83ms | |
| step:305/1770 train_time:31668ms step_avg:103.83ms | |
| step:306/1770 train_time:31773ms step_avg:103.83ms | |
| step:307/1770 train_time:31876ms step_avg:103.83ms | |
| step:308/1770 train_time:31980ms step_avg:103.83ms | |
| step:309/1770 train_time:32084ms step_avg:103.83ms | |
| step:310/1770 train_time:32188ms step_avg:103.83ms | |
| step:311/1770 train_time:32293ms step_avg:103.84ms | |
| step:312/1770 train_time:32397ms step_avg:103.84ms | |
| step:313/1770 train_time:32500ms step_avg:103.84ms | |
| step:314/1770 train_time:32604ms step_avg:103.83ms | |
| step:315/1770 train_time:32709ms step_avg:103.84ms | |
| step:316/1770 train_time:32813ms step_avg:103.84ms | |
| step:317/1770 train_time:32917ms step_avg:103.84ms | |
| step:318/1770 train_time:33022ms step_avg:103.84ms | |
| step:319/1770 train_time:33126ms step_avg:103.84ms | |
| step:320/1770 train_time:33230ms step_avg:103.85ms | |
| step:321/1770 train_time:33334ms step_avg:103.84ms | |
| step:322/1770 train_time:33438ms step_avg:103.84ms | |
| step:323/1770 train_time:33542ms step_avg:103.85ms | |
| step:324/1770 train_time:33647ms step_avg:103.85ms | |
| step:325/1770 train_time:33751ms step_avg:103.85ms | |
| step:326/1770 train_time:33855ms step_avg:103.85ms | |
| step:327/1770 train_time:33959ms step_avg:103.85ms | |
| step:328/1770 train_time:34063ms step_avg:103.85ms | |
| step:329/1770 train_time:34166ms step_avg:103.85ms | |
| step:330/1770 train_time:34271ms step_avg:103.85ms | |
| step:331/1770 train_time:34375ms step_avg:103.85ms | |
| step:332/1770 train_time:34479ms step_avg:103.85ms | |
| step:333/1770 train_time:34583ms step_avg:103.85ms | |
| step:334/1770 train_time:34687ms step_avg:103.85ms | |
| step:335/1770 train_time:34791ms step_avg:103.85ms | |
| step:336/1770 train_time:34896ms step_avg:103.86ms | |
| step:337/1770 train_time:35000ms step_avg:103.86ms | |
| step:338/1770 train_time:35103ms step_avg:103.86ms | |
| step:339/1770 train_time:35207ms step_avg:103.86ms | |
| step:340/1770 train_time:35312ms step_avg:103.86ms | |
| step:341/1770 train_time:35416ms step_avg:103.86ms | |
| step:342/1770 train_time:35520ms step_avg:103.86ms | |
| step:343/1770 train_time:35626ms step_avg:103.87ms | |
| step:344/1770 train_time:35729ms step_avg:103.86ms | |
| step:345/1770 train_time:35833ms step_avg:103.86ms | |
| step:346/1770 train_time:35936ms step_avg:103.86ms | |
| step:347/1770 train_time:36040ms step_avg:103.86ms | |
| step:348/1770 train_time:36145ms step_avg:103.86ms | |
| step:349/1770 train_time:36249ms step_avg:103.86ms | |
| step:350/1770 train_time:36354ms step_avg:103.87ms | |
| step:351/1770 train_time:36461ms step_avg:103.88ms | |
| step:352/1770 train_time:36563ms step_avg:103.87ms | |
| step:353/1770 train_time:36667ms step_avg:103.87ms | |
| step:354/1770 train_time:36774ms step_avg:103.88ms | |
| step:355/1770 train_time:36875ms step_avg:103.87ms | |
| step:356/1770 train_time:36979ms step_avg:103.87ms | |
| step:357/1770 train_time:37082ms step_avg:103.87ms | |
| step:358/1770 train_time:37186ms step_avg:103.87ms | |
| step:359/1770 train_time:37291ms step_avg:103.87ms | |
| step:360/1770 train_time:37395ms step_avg:103.88ms | |
| step:361/1770 train_time:37500ms step_avg:103.88ms | |
| step:362/1770 train_time:37604ms step_avg:103.88ms | |
| step:363/1770 train_time:37709ms step_avg:103.88ms | |
| step:364/1770 train_time:37812ms step_avg:103.88ms | |
| step:365/1770 train_time:37916ms step_avg:103.88ms | |
| step:366/1770 train_time:38020ms step_avg:103.88ms | |
| step:367/1770 train_time:38125ms step_avg:103.88ms | |
| step:368/1770 train_time:38228ms step_avg:103.88ms | |
| step:369/1770 train_time:38333ms step_avg:103.88ms | |
| step:370/1770 train_time:38438ms step_avg:103.89ms | |
| step:371/1770 train_time:38543ms step_avg:103.89ms | |
| step:372/1770 train_time:38646ms step_avg:103.89ms | |
| step:373/1770 train_time:38750ms step_avg:103.89ms | |
| step:374/1770 train_time:38855ms step_avg:103.89ms | |
| step:375/1770 train_time:38959ms step_avg:103.89ms | |
| step:375/1770 val_loss:4.6288 train_time:38963ms step_avg:103.90ms | |
| step:376/1770 train_time:39067ms step_avg:103.90ms | |
| step:377/1770 train_time:39171ms step_avg:103.90ms | |
| step:378/1770 train_time:39276ms step_avg:103.90ms | |
| step:379/1770 train_time:39380ms step_avg:103.91ms | |
| step:380/1770 train_time:39484ms step_avg:103.91ms | |
| step:381/1770 train_time:39589ms step_avg:103.91ms | |
| step:382/1770 train_time:39693ms step_avg:103.91ms | |
| step:383/1770 train_time:39797ms step_avg:103.91ms | |
| step:384/1770 train_time:39901ms step_avg:103.91ms | |
| step:385/1770 train_time:40004ms step_avg:103.91ms | |
| step:386/1770 train_time:40108ms step_avg:103.91ms | |
| step:387/1770 train_time:40212ms step_avg:103.91ms | |
| step:388/1770 train_time:40316ms step_avg:103.91ms | |
| step:389/1770 train_time:40420ms step_avg:103.91ms | |
| step:390/1770 train_time:40525ms step_avg:103.91ms | |
| step:391/1770 train_time:40629ms step_avg:103.91ms | |
| step:392/1770 train_time:40733ms step_avg:103.91ms | |
| step:393/1770 train_time:40837ms step_avg:103.91ms | |
| step:394/1770 train_time:40941ms step_avg:103.91ms | |
| step:395/1770 train_time:41048ms step_avg:103.92ms | |
| step:396/1770 train_time:41157ms step_avg:103.93ms | |
| step:397/1770 train_time:41260ms step_avg:103.93ms | |
| step:398/1770 train_time:41366ms step_avg:103.94ms | |
| step:399/1770 train_time:41473ms step_avg:103.94ms | |
| step:400/1770 train_time:41580ms step_avg:103.95ms | |
| step:401/1770 train_time:41686ms step_avg:103.95ms | |
| step:402/1770 train_time:41792ms step_avg:103.96ms | |
| step:403/1770 train_time:41901ms step_avg:103.97ms | |
| step:404/1770 train_time:42004ms step_avg:103.97ms | |
| step:405/1770 train_time:42110ms step_avg:103.98ms | |
| step:406/1770 train_time:42216ms step_avg:103.98ms | |
| step:407/1770 train_time:42323ms step_avg:103.99ms | |
| step:408/1770 train_time:42429ms step_avg:103.99ms | |
| step:409/1770 train_time:42536ms step_avg:104.00ms | |
| step:410/1770 train_time:42642ms step_avg:104.01ms | |
| step:411/1770 train_time:42749ms step_avg:104.01ms | |
| step:412/1770 train_time:42855ms step_avg:104.02ms | |
| step:413/1770 train_time:42961ms step_avg:104.02ms | |
| step:414/1770 train_time:43074ms step_avg:104.04ms | |
| step:415/1770 train_time:43182ms step_avg:104.05ms | |
| step:416/1770 train_time:43288ms step_avg:104.06ms | |
| step:417/1770 train_time:43395ms step_avg:104.06ms | |
| step:418/1770 train_time:43502ms step_avg:104.07ms | |
| step:419/1770 train_time:43605ms step_avg:104.07ms | |
| step:420/1770 train_time:43712ms step_avg:104.08ms | |
| step:421/1770 train_time:43817ms step_avg:104.08ms | |
| step:422/1770 train_time:43928ms step_avg:104.10ms | |
| step:423/1770 train_time:44034ms step_avg:104.10ms | |
| step:424/1770 train_time:44143ms step_avg:104.11ms | |
| step:425/1770 train_time:44247ms step_avg:104.11ms | |
| step:426/1770 train_time:44357ms step_avg:104.12ms | |
| step:427/1770 train_time:44461ms step_avg:104.12ms | |
| step:428/1770 train_time:44567ms step_avg:104.13ms | |
| step:429/1770 train_time:44673ms step_avg:104.13ms | |
| step:430/1770 train_time:44780ms step_avg:104.14ms | |
| step:431/1770 train_time:44885ms step_avg:104.14ms | |
| step:432/1770 train_time:44991ms step_avg:104.15ms | |
| step:433/1770 train_time:45097ms step_avg:104.15ms | |
| step:434/1770 train_time:45204ms step_avg:104.16ms | |
| step:435/1770 train_time:45312ms step_avg:104.17ms | |
| step:436/1770 train_time:45419ms step_avg:104.17ms | |
| step:437/1770 train_time:45525ms step_avg:104.18ms | |
| step:438/1770 train_time:45631ms step_avg:104.18ms | |
| step:439/1770 train_time:45737ms step_avg:104.18ms | |
| step:440/1770 train_time:45843ms step_avg:104.19ms | |
| step:441/1770 train_time:45950ms step_avg:104.19ms | |
| step:442/1770 train_time:46055ms step_avg:104.20ms | |
| step:443/1770 train_time:46162ms step_avg:104.20ms | |
| step:444/1770 train_time:46268ms step_avg:104.21ms | |
| step:445/1770 train_time:46373ms step_avg:104.21ms | |
| step:446/1770 train_time:46480ms step_avg:104.21ms | |
| step:447/1770 train_time:46586ms step_avg:104.22ms | |
| step:448/1770 train_time:46691ms step_avg:104.22ms | |
| step:449/1770 train_time:46798ms step_avg:104.23ms | |
| step:450/1770 train_time:46905ms step_avg:104.23ms | |
| step:451/1770 train_time:47011ms step_avg:104.24ms | |
| step:452/1770 train_time:47117ms step_avg:104.24ms | |
| step:453/1770 train_time:47223ms step_avg:104.25ms | |
| step:454/1770 train_time:47330ms step_avg:104.25ms | |
| step:455/1770 train_time:47437ms step_avg:104.26ms | |
| step:456/1770 train_time:47544ms step_avg:104.26ms | |
| step:457/1770 train_time:47651ms step_avg:104.27ms | |
| step:458/1770 train_time:47757ms step_avg:104.27ms | |
| step:459/1770 train_time:47863ms step_avg:104.28ms | |
| step:460/1770 train_time:47970ms step_avg:104.28ms | |
| step:461/1770 train_time:48078ms step_avg:104.29ms | |
| step:462/1770 train_time:48182ms step_avg:104.29ms | |
| step:463/1770 train_time:48288ms step_avg:104.29ms | |
| step:464/1770 train_time:48396ms step_avg:104.30ms | |
| step:465/1770 train_time:48499ms step_avg:104.30ms | |
| step:466/1770 train_time:48605ms step_avg:104.30ms | |
| step:467/1770 train_time:48711ms step_avg:104.31ms | |
| step:468/1770 train_time:48817ms step_avg:104.31ms | |
| step:469/1770 train_time:48923ms step_avg:104.31ms | |
| step:470/1770 train_time:49029ms step_avg:104.32ms | |
| step:471/1770 train_time:49136ms step_avg:104.32ms | |
| step:472/1770 train_time:49242ms step_avg:104.33ms | |
| step:473/1770 train_time:49348ms step_avg:104.33ms | |
| step:474/1770 train_time:49454ms step_avg:104.33ms | |
| step:475/1770 train_time:49560ms step_avg:104.34ms | |
| step:476/1770 train_time:49670ms step_avg:104.35ms | |
| step:477/1770 train_time:49773ms step_avg:104.35ms | |
| step:478/1770 train_time:49880ms step_avg:104.35ms | |
| step:479/1770 train_time:49986ms step_avg:104.35ms | |
| step:480/1770 train_time:50093ms step_avg:104.36ms | |
| step:481/1770 train_time:50199ms step_avg:104.36ms | |
| step:482/1770 train_time:50305ms step_avg:104.37ms | |
| step:483/1770 train_time:50411ms step_avg:104.37ms | |
| step:484/1770 train_time:50517ms step_avg:104.37ms | |
| step:485/1770 train_time:50624ms step_avg:104.38ms | |
| step:486/1770 train_time:50730ms step_avg:104.38ms | |
| step:487/1770 train_time:50838ms step_avg:104.39ms | |
| step:488/1770 train_time:50942ms step_avg:104.39ms | |
| step:489/1770 train_time:51047ms step_avg:104.39ms | |
| step:490/1770 train_time:51154ms step_avg:104.40ms | |
| step:491/1770 train_time:51264ms step_avg:104.41ms | |
| step:492/1770 train_time:51367ms step_avg:104.40ms | |
| step:493/1770 train_time:51473ms step_avg:104.41ms | |
| step:494/1770 train_time:51580ms step_avg:104.41ms | |
| step:495/1770 train_time:51687ms step_avg:104.42ms | |
| step:496/1770 train_time:51793ms step_avg:104.42ms | |
| step:497/1770 train_time:51899ms step_avg:104.43ms | |
| step:498/1770 train_time:52005ms step_avg:104.43ms | |
| step:499/1770 train_time:52111ms step_avg:104.43ms | |
| step:500/1770 train_time:52218ms step_avg:104.44ms | |
| step:500/1770 val_loss:4.4272 train_time:52223ms step_avg:104.45ms | |
| step:501/1770 train_time:52332ms step_avg:104.46ms | |
| step:502/1770 train_time:52438ms step_avg:104.46ms | |
| step:503/1770 train_time:52544ms step_avg:104.46ms | |
| step:504/1770 train_time:52651ms step_avg:104.47ms | |
| step:505/1770 train_time:52757ms step_avg:104.47ms | |
| step:506/1770 train_time:52863ms step_avg:104.47ms | |
| step:507/1770 train_time:52970ms step_avg:104.48ms | |
| step:508/1770 train_time:53076ms step_avg:104.48ms | |
| step:509/1770 train_time:53182ms step_avg:104.48ms | |
| step:510/1770 train_time:53288ms step_avg:104.49ms | |
| step:511/1770 train_time:53396ms step_avg:104.49ms | |
| step:512/1770 train_time:53499ms step_avg:104.49ms | |
| step:513/1770 train_time:53605ms step_avg:104.49ms | |
| step:514/1770 train_time:53712ms step_avg:104.50ms | |
| step:515/1770 train_time:53821ms step_avg:104.51ms | |
| step:516/1770 train_time:53925ms step_avg:104.51ms | |
| step:517/1770 train_time:54031ms step_avg:104.51ms | |
| step:518/1770 train_time:54136ms step_avg:104.51ms | |
| step:519/1770 train_time:54242ms step_avg:104.51ms | |
| step:520/1770 train_time:54349ms step_avg:104.52ms | |
| step:521/1770 train_time:54455ms step_avg:104.52ms | |
| step:522/1770 train_time:54562ms step_avg:104.52ms | |
| step:523/1770 train_time:54668ms step_avg:104.53ms | |
| step:524/1770 train_time:54774ms step_avg:104.53ms | |
| step:525/1770 train_time:54880ms step_avg:104.53ms | |
| step:526/1770 train_time:54987ms step_avg:104.54ms | |
| step:527/1770 train_time:55093ms step_avg:104.54ms | |
| step:528/1770 train_time:55199ms step_avg:104.54ms | |
| step:529/1770 train_time:55306ms step_avg:104.55ms | |
| step:530/1770 train_time:55412ms step_avg:104.55ms | |
| step:531/1770 train_time:55520ms step_avg:104.56ms | |
| step:532/1770 train_time:55627ms step_avg:104.56ms | |
| step:533/1770 train_time:55734ms step_avg:104.57ms | |
| step:534/1770 train_time:55841ms step_avg:104.57ms | |
| step:535/1770 train_time:55947ms step_avg:104.57ms | |
| step:536/1770 train_time:56054ms step_avg:104.58ms | |
| step:537/1770 train_time:56161ms step_avg:104.58ms | |
| step:538/1770 train_time:56268ms step_avg:104.59ms | |
| step:539/1770 train_time:56374ms step_avg:104.59ms | |
| step:540/1770 train_time:56480ms step_avg:104.59ms | |
| step:541/1770 train_time:56586ms step_avg:104.60ms | |
| step:542/1770 train_time:56696ms step_avg:104.60ms | |
| step:543/1770 train_time:56800ms step_avg:104.60ms | |
| step:544/1770 train_time:56905ms step_avg:104.60ms | |
| step:545/1770 train_time:57011ms step_avg:104.61ms | |
| step:546/1770 train_time:57118ms step_avg:104.61ms | |
| step:547/1770 train_time:57225ms step_avg:104.62ms | |
| step:548/1770 train_time:57332ms step_avg:104.62ms | |
| step:549/1770 train_time:57438ms step_avg:104.62ms | |
| step:550/1770 train_time:57545ms step_avg:104.63ms | |
| step:551/1770 train_time:57651ms step_avg:104.63ms | |
| step:552/1770 train_time:57757ms step_avg:104.63ms | |
| step:553/1770 train_time:57864ms step_avg:104.64ms | |
| step:554/1770 train_time:57971ms step_avg:104.64ms | |
| step:555/1770 train_time:58077ms step_avg:104.64ms | |
| step:556/1770 train_time:58183ms step_avg:104.65ms | |
| step:557/1770 train_time:58290ms step_avg:104.65ms | |
| step:558/1770 train_time:58396ms step_avg:104.65ms | |
| step:559/1770 train_time:58503ms step_avg:104.66ms | |
| step:560/1770 train_time:58610ms step_avg:104.66ms | |
| step:561/1770 train_time:58716ms step_avg:104.66ms | |
| step:562/1770 train_time:58822ms step_avg:104.67ms | |
| step:563/1770 train_time:58929ms step_avg:104.67ms | |
| step:564/1770 train_time:59036ms step_avg:104.67ms | |
| step:565/1770 train_time:59143ms step_avg:104.68ms | |
| step:566/1770 train_time:59250ms step_avg:104.68ms | |
| step:567/1770 train_time:59356ms step_avg:104.68ms | |
| step:568/1770 train_time:59466ms step_avg:104.69ms | |
| step:569/1770 train_time:59576ms step_avg:104.70ms | |
| step:570/1770 train_time:59682ms step_avg:104.71ms | |
| step:571/1770 train_time:59788ms step_avg:104.71ms | |
| step:572/1770 train_time:59892ms step_avg:104.71ms | |
| step:573/1770 train_time:59998ms step_avg:104.71ms | |
| step:574/1770 train_time:60106ms step_avg:104.71ms | |
| step:575/1770 train_time:60212ms step_avg:104.72ms | |
| step:576/1770 train_time:60318ms step_avg:104.72ms | |
| step:577/1770 train_time:60425ms step_avg:104.72ms | |
| step:578/1770 train_time:60532ms step_avg:104.73ms | |
| step:579/1770 train_time:60638ms step_avg:104.73ms | |
| step:580/1770 train_time:60744ms step_avg:104.73ms | |
| step:581/1770 train_time:60850ms step_avg:104.73ms | |
| step:582/1770 train_time:60957ms step_avg:104.74ms | |
| step:583/1770 train_time:61064ms step_avg:104.74ms | |
| step:584/1770 train_time:61171ms step_avg:104.74ms | |
| step:585/1770 train_time:61277ms step_avg:104.75ms | |
| step:586/1770 train_time:61384ms step_avg:104.75ms | |
| step:587/1770 train_time:61490ms step_avg:104.75ms | |
| step:588/1770 train_time:61597ms step_avg:104.76ms | |
| step:589/1770 train_time:61703ms step_avg:104.76ms | |
| step:590/1770 train_time:61810ms step_avg:104.76ms | |
| step:591/1770 train_time:61916ms step_avg:104.77ms | |
| step:592/1770 train_time:62023ms step_avg:104.77ms | |
| step:593/1770 train_time:62130ms step_avg:104.77ms | |
| step:594/1770 train_time:62237ms step_avg:104.78ms | |
| step:595/1770 train_time:62343ms step_avg:104.78ms | |
| step:596/1770 train_time:62451ms step_avg:104.78ms | |
| step:597/1770 train_time:62560ms step_avg:104.79ms | |
| step:598/1770 train_time:62668ms step_avg:104.80ms | |
| step:599/1770 train_time:62771ms step_avg:104.79ms | |
| step:600/1770 train_time:62878ms step_avg:104.80ms | |
| step:601/1770 train_time:62984ms step_avg:104.80ms | |
| step:602/1770 train_time:63091ms step_avg:104.80ms | |
| step:603/1770 train_time:63196ms step_avg:104.80ms | |
| step:604/1770 train_time:63303ms step_avg:104.81ms | |
| step:605/1770 train_time:63409ms step_avg:104.81ms | |
| step:606/1770 train_time:63516ms step_avg:104.81ms | |
| step:607/1770 train_time:63622ms step_avg:104.81ms | |
| step:608/1770 train_time:63729ms step_avg:104.82ms | |
| step:609/1770 train_time:63836ms step_avg:104.82ms | |
| step:610/1770 train_time:63943ms step_avg:104.82ms | |
| step:611/1770 train_time:64050ms step_avg:104.83ms | |
| step:612/1770 train_time:64157ms step_avg:104.83ms | |
| step:613/1770 train_time:64264ms step_avg:104.83ms | |
| step:614/1770 train_time:64371ms step_avg:104.84ms | |
| step:615/1770 train_time:64477ms step_avg:104.84ms | |
| step:616/1770 train_time:64583ms step_avg:104.84ms | |
| step:617/1770 train_time:64690ms step_avg:104.85ms | |
| step:618/1770 train_time:64796ms step_avg:104.85ms | |
| step:619/1770 train_time:64902ms step_avg:104.85ms | |
| step:620/1770 train_time:65009ms step_avg:104.85ms | |
| step:621/1770 train_time:65115ms step_avg:104.85ms | |
| step:622/1770 train_time:65221ms step_avg:104.86ms | |
| step:623/1770 train_time:65328ms step_avg:104.86ms | |
| step:624/1770 train_time:65435ms step_avg:104.86ms | |
| step:625/1770 train_time:65542ms step_avg:104.87ms | |
| step:625/1770 val_loss:4.3012 train_time:65546ms step_avg:104.87ms | |
| step:626/1770 train_time:65653ms step_avg:104.88ms | |
| step:627/1770 train_time:65762ms step_avg:104.88ms | |
| step:628/1770 train_time:65869ms step_avg:104.89ms | |
| step:629/1770 train_time:65976ms step_avg:104.89ms | |
| step:630/1770 train_time:66083ms step_avg:104.89ms | |
| step:631/1770 train_time:66189ms step_avg:104.90ms | |
| step:632/1770 train_time:66295ms step_avg:104.90ms | |
| step:633/1770 train_time:66402ms step_avg:104.90ms | |
| step:634/1770 train_time:66509ms step_avg:104.90ms | |
| step:635/1770 train_time:66615ms step_avg:104.91ms | |
| step:636/1770 train_time:66723ms step_avg:104.91ms | |
| step:637/1770 train_time:66829ms step_avg:104.91ms | |
| step:638/1770 train_time:66936ms step_avg:104.92ms | |
| step:639/1770 train_time:67042ms step_avg:104.92ms | |
| step:640/1770 train_time:67148ms step_avg:104.92ms | |
| step:641/1770 train_time:67259ms step_avg:104.93ms | |
| step:642/1770 train_time:67362ms step_avg:104.92ms | |
| step:643/1770 train_time:67468ms step_avg:104.93ms | |
| step:644/1770 train_time:67575ms step_avg:104.93ms | |
| step:645/1770 train_time:67681ms step_avg:104.93ms | |
| step:646/1770 train_time:67788ms step_avg:104.93ms | |
| step:647/1770 train_time:67895ms step_avg:104.94ms | |
| step:648/1770 train_time:68001ms step_avg:104.94ms | |
| step:649/1770 train_time:68107ms step_avg:104.94ms | |
| step:650/1770 train_time:68214ms step_avg:104.95ms | |
| step:651/1770 train_time:68321ms step_avg:104.95ms | |
| step:652/1770 train_time:68429ms step_avg:104.95ms | |
| step:653/1770 train_time:68536ms step_avg:104.95ms | |
| step:654/1770 train_time:68642ms step_avg:104.96ms | |
| step:655/1770 train_time:68748ms step_avg:104.96ms | |
| step:656/1770 train_time:68855ms step_avg:104.96ms | |
| step:657/1770 train_time:68967ms step_avg:104.97ms | |
| step:658/1770 train_time:69072ms step_avg:104.97ms | |
| step:659/1770 train_time:69180ms step_avg:104.98ms | |
| step:660/1770 train_time:69288ms step_avg:104.98ms | |
| step:661/1770 train_time:69398ms step_avg:104.99ms | |
| step:662/1770 train_time:69506ms step_avg:104.99ms | |
| step:663/1770 train_time:69615ms step_avg:105.00ms | |
| step:664/1770 train_time:69724ms step_avg:105.01ms | |
| step:665/1770 train_time:69832ms step_avg:105.01ms | |
| step:666/1770 train_time:69940ms step_avg:105.02ms | |
| step:667/1770 train_time:70048ms step_avg:105.02ms | |
| step:668/1770 train_time:70157ms step_avg:105.03ms | |
| step:669/1770 train_time:70265ms step_avg:105.03ms | |
| step:670/1770 train_time:70373ms step_avg:105.03ms | |
| step:671/1770 train_time:70482ms step_avg:105.04ms | |
| step:672/1770 train_time:70590ms step_avg:105.04ms | |
| step:673/1770 train_time:70699ms step_avg:105.05ms | |
| step:674/1770 train_time:70807ms step_avg:105.05ms | |
| step:675/1770 train_time:70916ms step_avg:105.06ms | |
| step:676/1770 train_time:71024ms step_avg:105.07ms | |
| step:677/1770 train_time:71133ms step_avg:105.07ms | |
| step:678/1770 train_time:71242ms step_avg:105.08ms | |
| step:679/1770 train_time:71350ms step_avg:105.08ms | |
| step:680/1770 train_time:71462ms step_avg:105.09ms | |
| step:681/1770 train_time:71567ms step_avg:105.09ms | |
| step:682/1770 train_time:71676ms step_avg:105.10ms | |
| step:683/1770 train_time:71784ms step_avg:105.10ms | |
| step:684/1770 train_time:71892ms step_avg:105.11ms | |
| step:685/1770 train_time:72000ms step_avg:105.11ms | |
| step:686/1770 train_time:72108ms step_avg:105.11ms | |
| step:687/1770 train_time:72216ms step_avg:105.12ms | |
| step:688/1770 train_time:72325ms step_avg:105.12ms | |
| step:689/1770 train_time:72433ms step_avg:105.13ms | |
| step:690/1770 train_time:72541ms step_avg:105.13ms | |
| step:691/1770 train_time:72649ms step_avg:105.14ms | |
| step:692/1770 train_time:72758ms step_avg:105.14ms | |
| step:693/1770 train_time:72866ms step_avg:105.15ms | |
| step:694/1770 train_time:72975ms step_avg:105.15ms | |
| step:695/1770 train_time:73083ms step_avg:105.15ms | |
| step:696/1770 train_time:73191ms step_avg:105.16ms | |
| step:697/1770 train_time:73300ms step_avg:105.17ms | |
| step:698/1770 train_time:73409ms step_avg:105.17ms | |
| step:699/1770 train_time:73517ms step_avg:105.17ms | |
| step:700/1770 train_time:73625ms step_avg:105.18ms | |
| step:701/1770 train_time:73734ms step_avg:105.18ms | |
| step:702/1770 train_time:73843ms step_avg:105.19ms | |
| step:703/1770 train_time:73951ms step_avg:105.19ms | |
| step:704/1770 train_time:74059ms step_avg:105.20ms | |
| step:705/1770 train_time:74167ms step_avg:105.20ms | |
| step:706/1770 train_time:74275ms step_avg:105.21ms | |
| step:707/1770 train_time:74383ms step_avg:105.21ms | |
| step:708/1770 train_time:74491ms step_avg:105.21ms | |
| step:709/1770 train_time:74601ms step_avg:105.22ms | |
| step:710/1770 train_time:74709ms step_avg:105.22ms | |
| step:711/1770 train_time:74817ms step_avg:105.23ms | |
| step:712/1770 train_time:74926ms step_avg:105.23ms | |
| step:713/1770 train_time:75034ms step_avg:105.24ms | |
| step:714/1770 train_time:75142ms step_avg:105.24ms | |
| step:715/1770 train_time:75251ms step_avg:105.25ms | |
| step:716/1770 train_time:75360ms step_avg:105.25ms | |
| step:717/1770 train_time:75467ms step_avg:105.25ms | |
| step:718/1770 train_time:75576ms step_avg:105.26ms | |
| step:719/1770 train_time:75685ms step_avg:105.26ms | |
| step:720/1770 train_time:75792ms step_avg:105.27ms | |
| step:721/1770 train_time:75900ms step_avg:105.27ms | |
| step:722/1770 train_time:76008ms step_avg:105.27ms | |
| step:723/1770 train_time:76116ms step_avg:105.28ms | |
| step:724/1770 train_time:76225ms step_avg:105.28ms | |
| step:725/1770 train_time:76333ms step_avg:105.29ms | |
| step:726/1770 train_time:76442ms step_avg:105.29ms | |
| step:727/1770 train_time:76550ms step_avg:105.30ms | |
| step:728/1770 train_time:76658ms step_avg:105.30ms | |
| step:729/1770 train_time:76766ms step_avg:105.30ms | |
| step:730/1770 train_time:76877ms step_avg:105.31ms | |
| step:731/1770 train_time:76982ms step_avg:105.31ms | |
| step:732/1770 train_time:77091ms step_avg:105.32ms | |
| step:733/1770 train_time:77199ms step_avg:105.32ms | |
| step:734/1770 train_time:77307ms step_avg:105.32ms | |
| step:735/1770 train_time:77420ms step_avg:105.33ms | |
| step:736/1770 train_time:77524ms step_avg:105.33ms | |
| step:737/1770 train_time:77633ms step_avg:105.34ms | |
| step:738/1770 train_time:77742ms step_avg:105.34ms | |
| step:739/1770 train_time:77849ms step_avg:105.34ms | |
| step:740/1770 train_time:77958ms step_avg:105.35ms | |
| step:741/1770 train_time:78065ms step_avg:105.35ms | |
| step:742/1770 train_time:78173ms step_avg:105.35ms | |
| step:743/1770 train_time:78282ms step_avg:105.36ms | |
| step:744/1770 train_time:78390ms step_avg:105.36ms | |
| step:745/1770 train_time:78498ms step_avg:105.37ms | |
| step:746/1770 train_time:78607ms step_avg:105.37ms | |
| step:747/1770 train_time:78716ms step_avg:105.38ms | |
| step:748/1770 train_time:78825ms step_avg:105.38ms | |
| step:749/1770 train_time:78934ms step_avg:105.39ms | |
| step:750/1770 train_time:79041ms step_avg:105.39ms | |
| step:750/1770 val_loss:4.2175 train_time:79046ms step_avg:105.39ms | |
| step:751/1770 train_time:79155ms step_avg:105.40ms | |
| step:752/1770 train_time:79264ms step_avg:105.40ms | |
| step:753/1770 train_time:79374ms step_avg:105.41ms | |
| step:754/1770 train_time:79483ms step_avg:105.42ms | |
| step:755/1770 train_time:79592ms step_avg:105.42ms | |
| step:756/1770 train_time:79702ms step_avg:105.43ms | |
| step:757/1770 train_time:79809ms step_avg:105.43ms | |
| step:758/1770 train_time:79919ms step_avg:105.43ms | |
| step:759/1770 train_time:80027ms step_avg:105.44ms | |
| step:760/1770 train_time:80136ms step_avg:105.44ms | |
| step:761/1770 train_time:80243ms step_avg:105.44ms | |
| step:762/1770 train_time:80351ms step_avg:105.45ms | |
| step:763/1770 train_time:80460ms step_avg:105.45ms | |
| step:764/1770 train_time:80569ms step_avg:105.46ms | |
| step:765/1770 train_time:80677ms step_avg:105.46ms | |
| step:766/1770 train_time:80786ms step_avg:105.46ms | |
| step:767/1770 train_time:80895ms step_avg:105.47ms | |
| step:768/1770 train_time:81003ms step_avg:105.47ms | |
| step:769/1770 train_time:81111ms step_avg:105.48ms | |
| step:770/1770 train_time:81219ms step_avg:105.48ms | |
| step:771/1770 train_time:81327ms step_avg:105.48ms | |
| step:772/1770 train_time:81435ms step_avg:105.49ms | |
| step:773/1770 train_time:81543ms step_avg:105.49ms | |
| step:774/1770 train_time:81651ms step_avg:105.49ms | |
| step:775/1770 train_time:81764ms step_avg:105.50ms | |
| step:776/1770 train_time:81869ms step_avg:105.50ms | |
| step:777/1770 train_time:81977ms step_avg:105.51ms | |
| step:778/1770 train_time:82085ms step_avg:105.51ms | |
| step:779/1770 train_time:82194ms step_avg:105.51ms | |
| step:780/1770 train_time:82303ms step_avg:105.52ms | |
| step:781/1770 train_time:82410ms step_avg:105.52ms | |
| step:782/1770 train_time:82519ms step_avg:105.52ms | |
| step:783/1770 train_time:82627ms step_avg:105.53ms | |
| step:784/1770 train_time:82735ms step_avg:105.53ms | |
| step:785/1770 train_time:82843ms step_avg:105.53ms | |
| step:786/1770 train_time:82952ms step_avg:105.54ms | |
| step:787/1770 train_time:83060ms step_avg:105.54ms | |
| step:788/1770 train_time:83169ms step_avg:105.54ms | |
| step:789/1770 train_time:83278ms step_avg:105.55ms | |
| step:790/1770 train_time:83387ms step_avg:105.55ms | |
| step:791/1770 train_time:83495ms step_avg:105.56ms | |
| step:792/1770 train_time:83604ms step_avg:105.56ms | |
| step:793/1770 train_time:83712ms step_avg:105.56ms | |
| step:794/1770 train_time:83821ms step_avg:105.57ms | |
| step:795/1770 train_time:83929ms step_avg:105.57ms | |
| step:796/1770 train_time:84037ms step_avg:105.57ms | |
| step:797/1770 train_time:84146ms step_avg:105.58ms | |
| step:798/1770 train_time:84255ms step_avg:105.58ms | |
| step:799/1770 train_time:84363ms step_avg:105.59ms | |
| step:800/1770 train_time:84472ms step_avg:105.59ms | |
| step:801/1770 train_time:84581ms step_avg:105.59ms | |
| step:802/1770 train_time:84689ms step_avg:105.60ms | |
| step:803/1770 train_time:84799ms step_avg:105.60ms | |
| step:804/1770 train_time:84908ms step_avg:105.61ms | |
| step:805/1770 train_time:85016ms step_avg:105.61ms | |
| step:806/1770 train_time:85126ms step_avg:105.62ms | |
| step:807/1770 train_time:85235ms step_avg:105.62ms | |
| step:808/1770 train_time:85343ms step_avg:105.62ms | |
| step:809/1770 train_time:85452ms step_avg:105.63ms | |
| step:810/1770 train_time:85561ms step_avg:105.63ms | |
| step:811/1770 train_time:85671ms step_avg:105.64ms | |
| step:812/1770 train_time:85778ms step_avg:105.64ms | |
| step:813/1770 train_time:85887ms step_avg:105.64ms | |
| step:814/1770 train_time:85995ms step_avg:105.65ms | |
| step:815/1770 train_time:86104ms step_avg:105.65ms | |
| step:816/1770 train_time:86212ms step_avg:105.65ms | |
| step:817/1770 train_time:86321ms step_avg:105.66ms | |
| step:818/1770 train_time:86429ms step_avg:105.66ms | |
| step:819/1770 train_time:86538ms step_avg:105.66ms | |
| step:820/1770 train_time:86647ms step_avg:105.67ms | |
| step:821/1770 train_time:86755ms step_avg:105.67ms | |
| step:822/1770 train_time:86864ms step_avg:105.67ms | |
| step:823/1770 train_time:86973ms step_avg:105.68ms | |
| step:824/1770 train_time:87082ms step_avg:105.68ms | |
| step:825/1770 train_time:87190ms step_avg:105.68ms | |
| step:826/1770 train_time:87299ms step_avg:105.69ms | |
| step:827/1770 train_time:87407ms step_avg:105.69ms | |
| step:828/1770 train_time:87516ms step_avg:105.70ms | |
| step:829/1770 train_time:87625ms step_avg:105.70ms | |
| step:830/1770 train_time:87734ms step_avg:105.70ms | |
| step:831/1770 train_time:87843ms step_avg:105.71ms | |
| step:832/1770 train_time:87951ms step_avg:105.71ms | |
| step:833/1770 train_time:88060ms step_avg:105.71ms | |
| step:834/1770 train_time:88171ms step_avg:105.72ms | |
| step:835/1770 train_time:88277ms step_avg:105.72ms | |
| step:836/1770 train_time:88386ms step_avg:105.72ms | |
| step:837/1770 train_time:88495ms step_avg:105.73ms | |
| step:838/1770 train_time:88603ms step_avg:105.73ms | |
| step:839/1770 train_time:88712ms step_avg:105.74ms | |
| step:840/1770 train_time:88821ms step_avg:105.74ms | |
| step:841/1770 train_time:88930ms step_avg:105.74ms | |
| step:842/1770 train_time:89039ms step_avg:105.75ms | |
| step:843/1770 train_time:89148ms step_avg:105.75ms | |
| step:844/1770 train_time:89256ms step_avg:105.75ms | |
| step:845/1770 train_time:89364ms step_avg:105.76ms | |
| step:846/1770 train_time:89474ms step_avg:105.76ms | |
| step:847/1770 train_time:89582ms step_avg:105.76ms | |
| step:848/1770 train_time:89691ms step_avg:105.77ms | |
| step:849/1770 train_time:89799ms step_avg:105.77ms | |
| step:850/1770 train_time:89907ms step_avg:105.77ms | |
| step:851/1770 train_time:90015ms step_avg:105.78ms | |
| step:852/1770 train_time:90124ms step_avg:105.78ms | |
| step:853/1770 train_time:90232ms step_avg:105.78ms | |
| step:854/1770 train_time:90341ms step_avg:105.79ms | |
| step:855/1770 train_time:90449ms step_avg:105.79ms | |
| step:856/1770 train_time:90559ms step_avg:105.79ms | |
| step:857/1770 train_time:90667ms step_avg:105.80ms | |
| step:858/1770 train_time:90775ms step_avg:105.80ms | |
| step:859/1770 train_time:90883ms step_avg:105.80ms | |
| step:860/1770 train_time:90992ms step_avg:105.80ms | |
| step:861/1770 train_time:91101ms step_avg:105.81ms | |
| step:862/1770 train_time:91209ms step_avg:105.81ms | |
| step:863/1770 train_time:91317ms step_avg:105.81ms | |
| step:864/1770 train_time:91425ms step_avg:105.82ms | |
| step:865/1770 train_time:91534ms step_avg:105.82ms | |
| step:866/1770 train_time:91643ms step_avg:105.82ms | |
| step:867/1770 train_time:91751ms step_avg:105.83ms | |
| step:868/1770 train_time:91860ms step_avg:105.83ms | |
| step:869/1770 train_time:91969ms step_avg:105.83ms | |
| step:870/1770 train_time:92078ms step_avg:105.84ms | |
| step:871/1770 train_time:92187ms step_avg:105.84ms | |
| step:872/1770 train_time:92295ms step_avg:105.84ms | |
| step:873/1770 train_time:92404ms step_avg:105.85ms | |
| step:874/1770 train_time:92513ms step_avg:105.85ms | |
| step:875/1770 train_time:92621ms step_avg:105.85ms | |
| step:875/1770 val_loss:4.1576 train_time:92625ms step_avg:105.86ms | |
| step:876/1770 train_time:92736ms step_avg:105.86ms | |
| step:877/1770 train_time:92844ms step_avg:105.87ms | |
| step:878/1770 train_time:92953ms step_avg:105.87ms | |
| step:879/1770 train_time:93062ms step_avg:105.87ms | |
| step:880/1770 train_time:93170ms step_avg:105.88ms | |
| step:881/1770 train_time:93279ms step_avg:105.88ms | |
| step:882/1770 train_time:93387ms step_avg:105.88ms | |
| step:883/1770 train_time:93496ms step_avg:105.88ms | |
| step:884/1770 train_time:93604ms step_avg:105.89ms | |
| step:885/1770 train_time:93713ms step_avg:105.89ms | |
| step:886/1770 train_time:93821ms step_avg:105.89ms | |
| step:887/1770 train_time:93929ms step_avg:105.90ms | |
| step:888/1770 train_time:94038ms step_avg:105.90ms | |
| step:889/1770 train_time:94147ms step_avg:105.90ms | |
| step:890/1770 train_time:94256ms step_avg:105.91ms | |
| step:891/1770 train_time:94364ms step_avg:105.91ms | |
| step:892/1770 train_time:94472ms step_avg:105.91ms | |
| step:893/1770 train_time:94585ms step_avg:105.92ms | |
| step:894/1770 train_time:94691ms step_avg:105.92ms | |
| step:895/1770 train_time:94800ms step_avg:105.92ms | |
| step:896/1770 train_time:94909ms step_avg:105.93ms | |
| step:897/1770 train_time:95018ms step_avg:105.93ms | |
| step:898/1770 train_time:95127ms step_avg:105.93ms | |
| step:899/1770 train_time:95235ms step_avg:105.93ms | |
| step:900/1770 train_time:95344ms step_avg:105.94ms | |
| step:901/1770 train_time:95453ms step_avg:105.94ms | |
| step:902/1770 train_time:95561ms step_avg:105.94ms | |
| step:903/1770 train_time:95670ms step_avg:105.95ms | |
| step:904/1770 train_time:95779ms step_avg:105.95ms | |
| step:905/1770 train_time:95888ms step_avg:105.95ms | |
| step:906/1770 train_time:95996ms step_avg:105.96ms | |
| step:907/1770 train_time:96105ms step_avg:105.96ms | |
| step:908/1770 train_time:96213ms step_avg:105.96ms | |
| step:909/1770 train_time:96321ms step_avg:105.96ms | |
| step:910/1770 train_time:96432ms step_avg:105.97ms | |
| step:911/1770 train_time:96539ms step_avg:105.97ms | |
| step:912/1770 train_time:96648ms step_avg:105.97ms | |
| step:913/1770 train_time:96756ms step_avg:105.98ms | |
| step:914/1770 train_time:96865ms step_avg:105.98ms | |
| step:915/1770 train_time:96975ms step_avg:105.98ms | |
| step:916/1770 train_time:97083ms step_avg:105.99ms | |
| step:917/1770 train_time:97191ms step_avg:105.99ms | |
| step:918/1770 train_time:97299ms step_avg:105.99ms | |
| step:919/1770 train_time:97409ms step_avg:105.99ms | |
| step:920/1770 train_time:97519ms step_avg:106.00ms | |
| step:921/1770 train_time:97628ms step_avg:106.00ms | |
| step:922/1770 train_time:97739ms step_avg:106.01ms | |
| step:923/1770 train_time:97849ms step_avg:106.01ms | |
| step:924/1770 train_time:97959ms step_avg:106.02ms | |
| step:925/1770 train_time:98069ms step_avg:106.02ms | |
| step:926/1770 train_time:98179ms step_avg:106.02ms | |
| step:927/1770 train_time:98290ms step_avg:106.03ms | |
| step:928/1770 train_time:98400ms step_avg:106.03ms | |
| step:929/1770 train_time:98510ms step_avg:106.04ms | |
| step:930/1770 train_time:98621ms step_avg:106.04ms | |
| step:931/1770 train_time:98729ms step_avg:106.05ms | |
| step:932/1770 train_time:98840ms step_avg:106.05ms | |
| step:933/1770 train_time:98949ms step_avg:106.06ms | |
| step:934/1770 train_time:99060ms step_avg:106.06ms | |
| step:935/1770 train_time:99170ms step_avg:106.06ms | |
| step:936/1770 train_time:99280ms step_avg:106.07ms | |
| step:937/1770 train_time:99392ms step_avg:106.07ms | |
| step:938/1770 train_time:99501ms step_avg:106.08ms | |
| step:939/1770 train_time:99612ms step_avg:106.08ms | |
| step:940/1770 train_time:99722ms step_avg:106.09ms | |
| step:941/1770 train_time:99832ms step_avg:106.09ms | |
| step:942/1770 train_time:99942ms step_avg:106.10ms | |
| step:943/1770 train_time:100052ms step_avg:106.10ms | |
| step:944/1770 train_time:100162ms step_avg:106.10ms | |
| step:945/1770 train_time:100272ms step_avg:106.11ms | |
| step:946/1770 train_time:100383ms step_avg:106.11ms | |
| step:947/1770 train_time:100493ms step_avg:106.12ms | |
| step:948/1770 train_time:100604ms step_avg:106.12ms | |
| step:949/1770 train_time:100713ms step_avg:106.13ms | |
| step:950/1770 train_time:100823ms step_avg:106.13ms | |
| step:951/1770 train_time:100932ms step_avg:106.13ms | |
| step:952/1770 train_time:101042ms step_avg:106.14ms | |
| step:953/1770 train_time:101153ms step_avg:106.14ms | |
| step:954/1770 train_time:101263ms step_avg:106.15ms | |
| step:955/1770 train_time:101374ms step_avg:106.15ms | |
| step:956/1770 train_time:101484ms step_avg:106.16ms | |
| step:957/1770 train_time:101595ms step_avg:106.16ms | |
| step:958/1770 train_time:101705ms step_avg:106.16ms | |
| step:959/1770 train_time:101816ms step_avg:106.17ms | |
| step:960/1770 train_time:101926ms step_avg:106.17ms | |
| step:961/1770 train_time:102036ms step_avg:106.18ms | |
| step:962/1770 train_time:102146ms step_avg:106.18ms | |
| step:963/1770 train_time:102256ms step_avg:106.19ms | |
| step:964/1770 train_time:102366ms step_avg:106.19ms | |
| step:965/1770 train_time:102477ms step_avg:106.19ms | |
| step:966/1770 train_time:102588ms step_avg:106.20ms | |
| step:967/1770 train_time:102696ms step_avg:106.20ms | |
| step:968/1770 train_time:102805ms step_avg:106.20ms | |
| step:969/1770 train_time:102914ms step_avg:106.21ms | |
| step:970/1770 train_time:103024ms step_avg:106.21ms | |
| step:971/1770 train_time:103135ms step_avg:106.22ms | |
| step:972/1770 train_time:103245ms step_avg:106.22ms | |
| step:973/1770 train_time:103355ms step_avg:106.22ms | |
| step:974/1770 train_time:103465ms step_avg:106.23ms | |
| step:975/1770 train_time:103575ms step_avg:106.23ms | |
| step:976/1770 train_time:103685ms step_avg:106.23ms | |
| step:977/1770 train_time:103795ms step_avg:106.24ms | |
| step:978/1770 train_time:103905ms step_avg:106.24ms | |
| step:979/1770 train_time:104014ms step_avg:106.25ms | |
| step:980/1770 train_time:104124ms step_avg:106.25ms | |
| step:981/1770 train_time:104234ms step_avg:106.25ms | |
| step:982/1770 train_time:104344ms step_avg:106.26ms | |
| step:983/1770 train_time:104455ms step_avg:106.26ms | |
| step:984/1770 train_time:104565ms step_avg:106.27ms | |
| step:985/1770 train_time:104675ms step_avg:106.27ms | |
| step:986/1770 train_time:104786ms step_avg:106.27ms | |
| step:987/1770 train_time:104896ms step_avg:106.28ms | |
| step:988/1770 train_time:105007ms step_avg:106.28ms | |
| step:989/1770 train_time:105117ms step_avg:106.29ms | |
| step:990/1770 train_time:105227ms step_avg:106.29ms | |
| step:991/1770 train_time:105338ms step_avg:106.29ms | |
| step:992/1770 train_time:105448ms step_avg:106.30ms | |
| step:993/1770 train_time:105558ms step_avg:106.30ms | |
| step:994/1770 train_time:105668ms step_avg:106.31ms | |
| step:995/1770 train_time:105779ms step_avg:106.31ms | |
| step:996/1770 train_time:105889ms step_avg:106.31ms | |
| step:997/1770 train_time:105999ms step_avg:106.32ms | |
| step:998/1770 train_time:106109ms step_avg:106.32ms | |
| step:999/1770 train_time:106220ms step_avg:106.33ms | |
| step:1000/1770 train_time:106330ms step_avg:106.33ms | |
| step:1000/1770 val_loss:4.0869 train_time:106335ms step_avg:106.33ms | |
| step:1001/1770 train_time:106450ms step_avg:106.34ms | |
| step:1002/1770 train_time:106560ms step_avg:106.35ms | |
| step:1003/1770 train_time:106670ms step_avg:106.35ms | |
| step:1004/1770 train_time:106780ms step_avg:106.35ms | |
| step:1005/1770 train_time:106890ms step_avg:106.36ms | |
| step:1006/1770 train_time:107000ms step_avg:106.36ms | |
| step:1007/1770 train_time:107110ms step_avg:106.37ms | |
| step:1008/1770 train_time:107219ms step_avg:106.37ms | |
| step:1009/1770 train_time:107330ms step_avg:106.37ms | |
| step:1010/1770 train_time:107440ms step_avg:106.38ms | |
| step:1011/1770 train_time:107550ms step_avg:106.38ms | |
| step:1012/1770 train_time:107659ms step_avg:106.38ms | |
| step:1013/1770 train_time:107768ms step_avg:106.38ms | |
| step:1014/1770 train_time:107877ms step_avg:106.39ms | |
| step:1015/1770 train_time:107986ms step_avg:106.39ms | |
| step:1016/1770 train_time:108097ms step_avg:106.39ms | |
| step:1017/1770 train_time:108207ms step_avg:106.40ms | |
| step:1018/1770 train_time:108317ms step_avg:106.40ms | |
| step:1019/1770 train_time:108427ms step_avg:106.41ms | |
| step:1020/1770 train_time:108537ms step_avg:106.41ms | |
| step:1021/1770 train_time:108647ms step_avg:106.41ms | |
| step:1022/1770 train_time:108757ms step_avg:106.42ms | |
| step:1023/1770 train_time:108867ms step_avg:106.42ms | |
| step:1024/1770 train_time:108977ms step_avg:106.42ms | |
| step:1025/1770 train_time:109087ms step_avg:106.43ms | |
| step:1026/1770 train_time:109197ms step_avg:106.43ms | |
| step:1027/1770 train_time:109306ms step_avg:106.43ms | |
| step:1028/1770 train_time:109417ms step_avg:106.44ms | |
| step:1029/1770 train_time:109527ms step_avg:106.44ms | |
| step:1030/1770 train_time:109638ms step_avg:106.44ms | |
| step:1031/1770 train_time:109747ms step_avg:106.45ms | |
| step:1032/1770 train_time:109857ms step_avg:106.45ms | |
| step:1033/1770 train_time:109967ms step_avg:106.45ms | |
| step:1034/1770 train_time:110078ms step_avg:106.46ms | |
| step:1035/1770 train_time:110187ms step_avg:106.46ms | |
| step:1036/1770 train_time:110297ms step_avg:106.46ms | |
| step:1037/1770 train_time:110407ms step_avg:106.47ms | |
| step:1038/1770 train_time:110517ms step_avg:106.47ms | |
| step:1039/1770 train_time:110627ms step_avg:106.47ms | |
| step:1040/1770 train_time:110739ms step_avg:106.48ms | |
| step:1041/1770 train_time:110847ms step_avg:106.48ms | |
| step:1042/1770 train_time:110958ms step_avg:106.49ms | |
| step:1043/1770 train_time:111067ms step_avg:106.49ms | |
| step:1044/1770 train_time:111178ms step_avg:106.49ms | |
| step:1045/1770 train_time:111287ms step_avg:106.49ms | |
| step:1046/1770 train_time:111397ms step_avg:106.50ms | |
| step:1047/1770 train_time:111507ms step_avg:106.50ms | |
| step:1048/1770 train_time:111617ms step_avg:106.50ms | |
| step:1049/1770 train_time:111727ms step_avg:106.51ms | |
| step:1050/1770 train_time:111838ms step_avg:106.51ms | |
| step:1051/1770 train_time:111949ms step_avg:106.52ms | |
| step:1052/1770 train_time:112058ms step_avg:106.52ms | |
| step:1053/1770 train_time:112168ms step_avg:106.52ms | |
| step:1054/1770 train_time:112277ms step_avg:106.53ms | |
| step:1055/1770 train_time:112387ms step_avg:106.53ms | |
| step:1056/1770 train_time:112497ms step_avg:106.53ms | |
| step:1057/1770 train_time:112608ms step_avg:106.54ms | |
| step:1058/1770 train_time:112717ms step_avg:106.54ms | |
| step:1059/1770 train_time:112828ms step_avg:106.54ms | |
| step:1060/1770 train_time:112938ms step_avg:106.55ms | |
| step:1061/1770 train_time:113048ms step_avg:106.55ms | |
| step:1062/1770 train_time:113163ms step_avg:106.56ms | |
| step:1063/1770 train_time:113269ms step_avg:106.56ms | |
| step:1064/1770 train_time:113380ms step_avg:106.56ms | |
| step:1065/1770 train_time:113490ms step_avg:106.56ms | |
| step:1066/1770 train_time:113602ms step_avg:106.57ms | |
| step:1067/1770 train_time:113714ms step_avg:106.57ms | |
| step:1068/1770 train_time:113824ms step_avg:106.58ms | |
| step:1069/1770 train_time:113935ms step_avg:106.58ms | |
| step:1070/1770 train_time:114045ms step_avg:106.58ms | |
| step:1071/1770 train_time:114156ms step_avg:106.59ms | |
| step:1072/1770 train_time:114267ms step_avg:106.59ms | |
| step:1073/1770 train_time:114378ms step_avg:106.60ms | |
| step:1074/1770 train_time:114489ms step_avg:106.60ms | |
| step:1075/1770 train_time:114598ms step_avg:106.60ms | |
| step:1076/1770 train_time:114709ms step_avg:106.61ms | |
| step:1077/1770 train_time:114820ms step_avg:106.61ms | |
| step:1078/1770 train_time:114930ms step_avg:106.61ms | |
| step:1079/1770 train_time:115041ms step_avg:106.62ms | |
| step:1080/1770 train_time:115154ms step_avg:106.62ms | |
| step:1081/1770 train_time:115261ms step_avg:106.62ms | |
| step:1082/1770 train_time:115371ms step_avg:106.63ms | |
| step:1083/1770 train_time:115481ms step_avg:106.63ms | |
| step:1084/1770 train_time:115591ms step_avg:106.63ms | |
| step:1085/1770 train_time:115702ms step_avg:106.64ms | |
| step:1086/1770 train_time:115812ms step_avg:106.64ms | |
| step:1087/1770 train_time:115921ms step_avg:106.64ms | |
| step:1088/1770 train_time:116031ms step_avg:106.65ms | |
| step:1089/1770 train_time:116141ms step_avg:106.65ms | |
| step:1090/1770 train_time:116251ms step_avg:106.65ms | |
| step:1091/1770 train_time:116361ms step_avg:106.66ms | |
| step:1092/1770 train_time:116472ms step_avg:106.66ms | |
| step:1093/1770 train_time:116582ms step_avg:106.66ms | |
| step:1094/1770 train_time:116692ms step_avg:106.67ms | |
| step:1095/1770 train_time:116803ms step_avg:106.67ms | |
| step:1096/1770 train_time:116912ms step_avg:106.67ms | |
| step:1097/1770 train_time:117024ms step_avg:106.68ms | |
| step:1098/1770 train_time:117137ms step_avg:106.68ms | |
| step:1099/1770 train_time:117246ms step_avg:106.68ms | |
| step:1100/1770 train_time:117355ms step_avg:106.69ms | |
| step:1101/1770 train_time:117465ms step_avg:106.69ms | |
| step:1102/1770 train_time:117575ms step_avg:106.69ms | |
| step:1103/1770 train_time:117686ms step_avg:106.70ms | |
| step:1104/1770 train_time:117796ms step_avg:106.70ms | |
| step:1105/1770 train_time:117907ms step_avg:106.70ms | |
| step:1106/1770 train_time:118017ms step_avg:106.71ms | |
| step:1107/1770 train_time:118128ms step_avg:106.71ms | |
| step:1108/1770 train_time:118238ms step_avg:106.71ms | |
| step:1109/1770 train_time:118348ms step_avg:106.72ms | |
| step:1110/1770 train_time:118458ms step_avg:106.72ms | |
| step:1111/1770 train_time:118571ms step_avg:106.72ms | |
| step:1112/1770 train_time:118678ms step_avg:106.73ms | |
| step:1113/1770 train_time:118789ms step_avg:106.73ms | |
| step:1114/1770 train_time:118899ms step_avg:106.73ms | |
| step:1115/1770 train_time:119009ms step_avg:106.73ms | |
| step:1116/1770 train_time:119120ms step_avg:106.74ms | |
| step:1117/1770 train_time:119232ms step_avg:106.74ms | |
| step:1118/1770 train_time:119340ms step_avg:106.74ms | |
| step:1119/1770 train_time:119450ms step_avg:106.75ms | |
| step:1120/1770 train_time:119560ms step_avg:106.75ms | |
| step:1121/1770 train_time:119670ms step_avg:106.75ms | |
| step:1122/1770 train_time:119781ms step_avg:106.76ms | |
| step:1123/1770 train_time:119890ms step_avg:106.76ms | |
| step:1124/1770 train_time:120000ms step_avg:106.76ms | |
| step:1125/1770 train_time:120110ms step_avg:106.76ms | |
| step:1125/1770 val_loss:4.0363 train_time:120115ms step_avg:106.77ms | |
| step:1126/1770 train_time:120226ms step_avg:106.77ms | |
| step:1127/1770 train_time:120336ms step_avg:106.78ms | |
| step:1128/1770 train_time:120447ms step_avg:106.78ms | |
| step:1129/1770 train_time:120558ms step_avg:106.78ms | |
| step:1130/1770 train_time:120668ms step_avg:106.79ms | |
| step:1131/1770 train_time:120777ms step_avg:106.79ms | |
| step:1132/1770 train_time:120887ms step_avg:106.79ms | |
| step:1133/1770 train_time:120998ms step_avg:106.79ms | |
| step:1134/1770 train_time:121108ms step_avg:106.80ms | |
| step:1135/1770 train_time:121219ms step_avg:106.80ms | |
| step:1136/1770 train_time:121329ms step_avg:106.80ms | |
| step:1137/1770 train_time:121439ms step_avg:106.81ms | |
| step:1138/1770 train_time:121549ms step_avg:106.81ms | |
| step:1139/1770 train_time:121660ms step_avg:106.81ms | |
| step:1140/1770 train_time:121770ms step_avg:106.82ms | |
| step:1141/1770 train_time:121880ms step_avg:106.82ms | |
| step:1142/1770 train_time:121990ms step_avg:106.82ms | |
| step:1143/1770 train_time:122101ms step_avg:106.82ms | |
| step:1144/1770 train_time:122211ms step_avg:106.83ms | |
| step:1145/1770 train_time:122321ms step_avg:106.83ms | |
| step:1146/1770 train_time:122431ms step_avg:106.83ms | |
| step:1147/1770 train_time:122542ms step_avg:106.84ms | |
| step:1148/1770 train_time:122652ms step_avg:106.84ms | |
| step:1149/1770 train_time:122764ms step_avg:106.84ms | |
| step:1150/1770 train_time:122873ms step_avg:106.85ms | |
| step:1151/1770 train_time:122983ms step_avg:106.85ms | |
| step:1152/1770 train_time:123093ms step_avg:106.85ms | |
| step:1153/1770 train_time:123203ms step_avg:106.85ms | |
| step:1154/1770 train_time:123313ms step_avg:106.86ms | |
| step:1155/1770 train_time:123425ms step_avg:106.86ms | |
| step:1156/1770 train_time:123536ms step_avg:106.86ms | |
| step:1157/1770 train_time:123645ms step_avg:106.87ms | |
| step:1158/1770 train_time:123754ms step_avg:106.87ms | |
| step:1159/1770 train_time:123864ms step_avg:106.87ms | |
| step:1160/1770 train_time:123975ms step_avg:106.87ms | |
| step:1161/1770 train_time:124085ms step_avg:106.88ms | |
| step:1162/1770 train_time:124195ms step_avg:106.88ms | |
| step:1163/1770 train_time:124305ms step_avg:106.88ms | |
| step:1164/1770 train_time:124416ms step_avg:106.89ms | |
| step:1165/1770 train_time:124527ms step_avg:106.89ms | |
| step:1166/1770 train_time:124637ms step_avg:106.89ms | |
| step:1167/1770 train_time:124748ms step_avg:106.90ms | |
| step:1168/1770 train_time:124860ms step_avg:106.90ms | |
| step:1169/1770 train_time:124968ms step_avg:106.90ms | |
| step:1170/1770 train_time:125078ms step_avg:106.90ms | |
| step:1171/1770 train_time:125188ms step_avg:106.91ms | |
| step:1172/1770 train_time:125298ms step_avg:106.91ms | |
| step:1173/1770 train_time:125408ms step_avg:106.91ms | |
| step:1174/1770 train_time:125519ms step_avg:106.92ms | |
| step:1175/1770 train_time:125629ms step_avg:106.92ms | |
| step:1176/1770 train_time:125741ms step_avg:106.92ms | |
| step:1177/1770 train_time:125851ms step_avg:106.93ms | |
| step:1178/1770 train_time:125959ms step_avg:106.93ms | |
| step:1179/1770 train_time:126069ms step_avg:106.93ms | |
| step:1180/1770 train_time:126180ms step_avg:106.93ms | |
| step:1181/1770 train_time:126289ms step_avg:106.93ms | |
| step:1182/1770 train_time:126401ms step_avg:106.94ms | |
| step:1183/1770 train_time:126512ms step_avg:106.94ms | |
| step:1184/1770 train_time:126623ms step_avg:106.95ms | |
| step:1185/1770 train_time:126735ms step_avg:106.95ms | |
| step:1186/1770 train_time:126847ms step_avg:106.95ms | |
| step:1187/1770 train_time:126959ms step_avg:106.96ms | |
| step:1188/1770 train_time:127071ms step_avg:106.96ms | |
| step:1189/1770 train_time:127181ms step_avg:106.96ms | |
| step:1190/1770 train_time:127294ms step_avg:106.97ms | |
| step:1191/1770 train_time:127405ms step_avg:106.97ms | |
| step:1192/1770 train_time:127518ms step_avg:106.98ms | |
| step:1193/1770 train_time:127629ms step_avg:106.98ms | |
| step:1194/1770 train_time:127740ms step_avg:106.99ms | |
| step:1195/1770 train_time:127852ms step_avg:106.99ms | |
| step:1196/1770 train_time:127964ms step_avg:106.99ms | |
| step:1197/1770 train_time:128075ms step_avg:107.00ms | |
| step:1198/1770 train_time:128186ms step_avg:107.00ms | |
| step:1199/1770 train_time:128297ms step_avg:107.00ms | |
| step:1200/1770 train_time:128410ms step_avg:107.01ms | |
| step:1201/1770 train_time:128521ms step_avg:107.01ms | |
| step:1202/1770 train_time:128634ms step_avg:107.02ms | |
| step:1203/1770 train_time:128746ms step_avg:107.02ms | |
| step:1204/1770 train_time:128859ms step_avg:107.03ms | |
| step:1205/1770 train_time:128972ms step_avg:107.03ms | |
| step:1206/1770 train_time:129086ms step_avg:107.04ms | |
| step:1207/1770 train_time:129195ms step_avg:107.04ms | |
| step:1208/1770 train_time:129308ms step_avg:107.04ms | |
| step:1209/1770 train_time:129417ms step_avg:107.04ms | |
| step:1210/1770 train_time:129530ms step_avg:107.05ms | |
| step:1211/1770 train_time:129641ms step_avg:107.05ms | |
| step:1212/1770 train_time:129752ms step_avg:107.06ms | |
| step:1213/1770 train_time:129863ms step_avg:107.06ms | |
| step:1214/1770 train_time:129975ms step_avg:107.06ms | |
| step:1215/1770 train_time:130085ms step_avg:107.07ms | |
| step:1216/1770 train_time:130196ms step_avg:107.07ms | |
| step:1217/1770 train_time:130306ms step_avg:107.07ms | |
| step:1218/1770 train_time:130417ms step_avg:107.07ms | |
| step:1219/1770 train_time:130529ms step_avg:107.08ms | |
| step:1220/1770 train_time:130641ms step_avg:107.08ms | |
| step:1221/1770 train_time:130752ms step_avg:107.09ms | |
| step:1222/1770 train_time:130864ms step_avg:107.09ms | |
| step:1223/1770 train_time:130976ms step_avg:107.09ms | |
| step:1224/1770 train_time:131087ms step_avg:107.10ms | |
| step:1225/1770 train_time:131199ms step_avg:107.10ms | |
| step:1226/1770 train_time:131312ms step_avg:107.11ms | |
| step:1227/1770 train_time:131424ms step_avg:107.11ms | |
| step:1228/1770 train_time:131535ms step_avg:107.11ms | |
| step:1229/1770 train_time:131647ms step_avg:107.12ms | |
| step:1230/1770 train_time:131759ms step_avg:107.12ms | |
| step:1231/1770 train_time:131870ms step_avg:107.12ms | |
| step:1232/1770 train_time:131981ms step_avg:107.13ms | |
| step:1233/1770 train_time:132092ms step_avg:107.13ms | |
| step:1234/1770 train_time:132203ms step_avg:107.13ms | |
| step:1235/1770 train_time:132314ms step_avg:107.14ms | |
| step:1236/1770 train_time:132425ms step_avg:107.14ms | |
| step:1237/1770 train_time:132536ms step_avg:107.14ms | |
| step:1238/1770 train_time:132647ms step_avg:107.15ms | |
| step:1239/1770 train_time:132758ms step_avg:107.15ms | |
| step:1240/1770 train_time:132869ms step_avg:107.15ms | |
| step:1241/1770 train_time:132982ms step_avg:107.16ms | |
| step:1242/1770 train_time:133092ms step_avg:107.16ms | |
| step:1243/1770 train_time:133204ms step_avg:107.16ms | |
| step:1244/1770 train_time:133316ms step_avg:107.17ms | |
| step:1245/1770 train_time:133428ms step_avg:107.17ms | |
| step:1246/1770 train_time:133540ms step_avg:107.18ms | |
| step:1247/1770 train_time:133651ms step_avg:107.18ms | |
| step:1248/1770 train_time:133763ms step_avg:107.18ms | |
| step:1249/1770 train_time:133876ms step_avg:107.19ms | |
| step:1250/1770 train_time:133988ms step_avg:107.19ms | |
| step:1250/1770 val_loss:3.9675 train_time:133992ms step_avg:107.19ms | |
| step:1251/1770 train_time:134105ms step_avg:107.20ms | |
| step:1252/1770 train_time:134217ms step_avg:107.20ms | |
| step:1253/1770 train_time:134328ms step_avg:107.21ms | |
| step:1254/1770 train_time:134439ms step_avg:107.21ms | |
| step:1255/1770 train_time:134551ms step_avg:107.21ms | |
| step:1256/1770 train_time:134661ms step_avg:107.21ms | |
| step:1257/1770 train_time:134772ms step_avg:107.22ms | |
| step:1258/1770 train_time:134883ms step_avg:107.22ms | |
| step:1259/1770 train_time:134995ms step_avg:107.22ms | |
| step:1260/1770 train_time:135107ms step_avg:107.23ms | |
| step:1261/1770 train_time:135218ms step_avg:107.23ms | |
| step:1262/1770 train_time:135330ms step_avg:107.23ms | |
| step:1263/1770 train_time:135441ms step_avg:107.24ms | |
| step:1264/1770 train_time:135556ms step_avg:107.24ms | |
| step:1265/1770 train_time:135664ms step_avg:107.24ms | |
| step:1266/1770 train_time:135775ms step_avg:107.25ms | |
| step:1267/1770 train_time:135888ms step_avg:107.25ms | |
| step:1268/1770 train_time:136000ms step_avg:107.26ms | |
| step:1269/1770 train_time:136112ms step_avg:107.26ms | |
| step:1270/1770 train_time:136222ms step_avg:107.26ms | |
| step:1271/1770 train_time:136335ms step_avg:107.27ms | |
| step:1272/1770 train_time:136446ms step_avg:107.27ms | |
| step:1273/1770 train_time:136557ms step_avg:107.27ms | |
| step:1274/1770 train_time:136667ms step_avg:107.27ms | |
| step:1275/1770 train_time:136780ms step_avg:107.28ms | |
| step:1276/1770 train_time:136891ms step_avg:107.28ms | |
| step:1277/1770 train_time:137002ms step_avg:107.28ms | |
| step:1278/1770 train_time:137115ms step_avg:107.29ms | |
| step:1279/1770 train_time:137227ms step_avg:107.29ms | |
| step:1280/1770 train_time:137338ms step_avg:107.30ms | |
| step:1281/1770 train_time:137449ms step_avg:107.30ms | |
| step:1282/1770 train_time:137561ms step_avg:107.30ms | |
| step:1283/1770 train_time:137673ms step_avg:107.31ms | |
| step:1284/1770 train_time:137784ms step_avg:107.31ms | |
| step:1285/1770 train_time:137896ms step_avg:107.31ms | |
| step:1286/1770 train_time:138008ms step_avg:107.32ms | |
| step:1287/1770 train_time:138119ms step_avg:107.32ms | |
| step:1288/1770 train_time:138230ms step_avg:107.32ms | |
| step:1289/1770 train_time:138341ms step_avg:107.32ms | |
| step:1290/1770 train_time:138452ms step_avg:107.33ms | |
| step:1291/1770 train_time:138564ms step_avg:107.33ms | |
| step:1292/1770 train_time:138674ms step_avg:107.33ms | |
| step:1293/1770 train_time:138785ms step_avg:107.34ms | |
| step:1294/1770 train_time:138897ms step_avg:107.34ms | |
| step:1295/1770 train_time:139009ms step_avg:107.34ms | |
| step:1296/1770 train_time:139120ms step_avg:107.35ms | |
| step:1297/1770 train_time:139232ms step_avg:107.35ms | |
| step:1298/1770 train_time:139344ms step_avg:107.35ms | |
| step:1299/1770 train_time:139455ms step_avg:107.36ms | |
| step:1300/1770 train_time:139565ms step_avg:107.36ms | |
| step:1301/1770 train_time:139678ms step_avg:107.36ms | |
| step:1302/1770 train_time:139789ms step_avg:107.36ms | |
| step:1303/1770 train_time:139901ms step_avg:107.37ms | |
| step:1304/1770 train_time:140014ms step_avg:107.37ms | |
| step:1305/1770 train_time:140124ms step_avg:107.37ms | |
| step:1306/1770 train_time:140235ms step_avg:107.38ms | |
| step:1307/1770 train_time:140346ms step_avg:107.38ms | |
| step:1308/1770 train_time:140456ms step_avg:107.38ms | |
| step:1309/1770 train_time:140568ms step_avg:107.39ms | |
| step:1310/1770 train_time:140681ms step_avg:107.39ms | |
| step:1311/1770 train_time:140792ms step_avg:107.39ms | |
| step:1312/1770 train_time:140903ms step_avg:107.40ms | |
| step:1313/1770 train_time:141015ms step_avg:107.40ms | |
| step:1314/1770 train_time:141128ms step_avg:107.40ms | |
| step:1315/1770 train_time:141239ms step_avg:107.41ms | |
| step:1316/1770 train_time:141350ms step_avg:107.41ms | |
| step:1317/1770 train_time:141462ms step_avg:107.41ms | |
| step:1318/1770 train_time:141573ms step_avg:107.42ms | |
| step:1319/1770 train_time:141685ms step_avg:107.42ms | |
| step:1320/1770 train_time:141797ms step_avg:107.42ms | |
| step:1321/1770 train_time:141908ms step_avg:107.42ms | |
| step:1322/1770 train_time:142019ms step_avg:107.43ms | |
| step:1323/1770 train_time:142131ms step_avg:107.43ms | |
| step:1324/1770 train_time:142242ms step_avg:107.43ms | |
| step:1325/1770 train_time:142353ms step_avg:107.44ms | |
| step:1326/1770 train_time:142465ms step_avg:107.44ms | |
| step:1327/1770 train_time:142577ms step_avg:107.44ms | |
| step:1328/1770 train_time:142688ms step_avg:107.45ms | |
| step:1329/1770 train_time:142800ms step_avg:107.45ms | |
| step:1330/1770 train_time:142911ms step_avg:107.45ms | |
| step:1331/1770 train_time:143022ms step_avg:107.45ms | |
| step:1332/1770 train_time:143138ms step_avg:107.46ms | |
| step:1333/1770 train_time:143245ms step_avg:107.46ms | |
| step:1334/1770 train_time:143357ms step_avg:107.46ms | |
| step:1335/1770 train_time:143468ms step_avg:107.47ms | |
| step:1336/1770 train_time:143580ms step_avg:107.47ms | |
| step:1337/1770 train_time:143691ms step_avg:107.47ms | |
| step:1338/1770 train_time:143803ms step_avg:107.48ms | |
| step:1339/1770 train_time:143915ms step_avg:107.48ms | |
| step:1340/1770 train_time:144028ms step_avg:107.48ms | |
| step:1341/1770 train_time:144140ms step_avg:107.49ms | |
| step:1342/1770 train_time:144252ms step_avg:107.49ms | |
| step:1343/1770 train_time:144362ms step_avg:107.49ms | |
| step:1344/1770 train_time:144474ms step_avg:107.50ms | |
| step:1345/1770 train_time:144585ms step_avg:107.50ms | |
| step:1346/1770 train_time:144697ms step_avg:107.50ms | |
| step:1347/1770 train_time:144808ms step_avg:107.50ms | |
| step:1348/1770 train_time:144921ms step_avg:107.51ms | |
| step:1349/1770 train_time:145032ms step_avg:107.51ms | |
| step:1350/1770 train_time:145142ms step_avg:107.51ms | |
| step:1351/1770 train_time:145253ms step_avg:107.52ms | |
| step:1352/1770 train_time:145365ms step_avg:107.52ms | |
| step:1353/1770 train_time:145477ms step_avg:107.52ms | |
| step:1354/1770 train_time:145589ms step_avg:107.53ms | |
| step:1355/1770 train_time:145701ms step_avg:107.53ms | |
| step:1356/1770 train_time:145813ms step_avg:107.53ms | |
| step:1357/1770 train_time:145925ms step_avg:107.54ms | |
| step:1358/1770 train_time:146038ms step_avg:107.54ms | |
| step:1359/1770 train_time:146149ms step_avg:107.54ms | |
| step:1360/1770 train_time:146260ms step_avg:107.54ms | |
| step:1361/1770 train_time:146371ms step_avg:107.55ms | |
| step:1362/1770 train_time:146483ms step_avg:107.55ms | |
| step:1363/1770 train_time:146596ms step_avg:107.55ms | |
| step:1364/1770 train_time:146707ms step_avg:107.56ms | |
| step:1365/1770 train_time:146818ms step_avg:107.56ms | |
| step:1366/1770 train_time:146930ms step_avg:107.56ms | |
| step:1367/1770 train_time:147041ms step_avg:107.56ms | |
| step:1368/1770 train_time:147153ms step_avg:107.57ms | |
| step:1369/1770 train_time:147265ms step_avg:107.57ms | |
| step:1370/1770 train_time:147376ms step_avg:107.57ms | |
| step:1371/1770 train_time:147488ms step_avg:107.58ms | |
| step:1372/1770 train_time:147602ms step_avg:107.58ms | |
| step:1373/1770 train_time:147711ms step_avg:107.58ms | |
| step:1374/1770 train_time:147822ms step_avg:107.59ms | |
| step:1375/1770 train_time:147934ms step_avg:107.59ms | |
| step:1375/1770 val_loss:3.9062 train_time:147939ms step_avg:107.59ms | |
| step:1376/1770 train_time:148056ms step_avg:107.60ms | |
| step:1377/1770 train_time:148168ms step_avg:107.60ms | |
| step:1378/1770 train_time:148280ms step_avg:107.61ms | |
| step:1379/1770 train_time:148391ms step_avg:107.61ms | |
| step:1380/1770 train_time:148503ms step_avg:107.61ms | |
| step:1381/1770 train_time:148617ms step_avg:107.62ms | |
| step:1382/1770 train_time:148733ms step_avg:107.62ms | |
| step:1383/1770 train_time:148846ms step_avg:107.63ms | |
| step:1384/1770 train_time:148956ms step_avg:107.63ms | |
| step:1385/1770 train_time:149066ms step_avg:107.63ms | |
| step:1386/1770 train_time:149178ms step_avg:107.63ms | |
| step:1387/1770 train_time:149290ms step_avg:107.64ms | |
| step:1388/1770 train_time:149403ms step_avg:107.64ms | |
| step:1389/1770 train_time:149515ms step_avg:107.64ms | |
| step:1390/1770 train_time:149627ms step_avg:107.65ms | |
| step:1391/1770 train_time:149739ms step_avg:107.65ms | |
| step:1392/1770 train_time:149851ms step_avg:107.65ms | |
| step:1393/1770 train_time:149962ms step_avg:107.65ms | |
| step:1394/1770 train_time:150072ms step_avg:107.66ms | |
| step:1395/1770 train_time:150185ms step_avg:107.66ms | |
| step:1396/1770 train_time:150297ms step_avg:107.66ms | |
| step:1397/1770 train_time:150409ms step_avg:107.67ms | |
| step:1398/1770 train_time:150519ms step_avg:107.67ms | |
| step:1399/1770 train_time:150633ms step_avg:107.67ms | |
| step:1400/1770 train_time:150744ms step_avg:107.67ms | |
| step:1401/1770 train_time:150856ms step_avg:107.68ms | |
| step:1402/1770 train_time:150968ms step_avg:107.68ms | |
| step:1403/1770 train_time:151080ms step_avg:107.68ms | |
| step:1404/1770 train_time:151191ms step_avg:107.69ms | |
| step:1405/1770 train_time:151302ms step_avg:107.69ms | |
| step:1406/1770 train_time:151413ms step_avg:107.69ms | |
| step:1407/1770 train_time:151525ms step_avg:107.69ms | |
| step:1408/1770 train_time:151636ms step_avg:107.70ms | |
| step:1409/1770 train_time:151748ms step_avg:107.70ms | |
| step:1410/1770 train_time:151859ms step_avg:107.70ms | |
| step:1411/1770 train_time:151971ms step_avg:107.70ms | |
| step:1412/1770 train_time:152082ms step_avg:107.71ms | |
| step:1413/1770 train_time:152192ms step_avg:107.71ms | |
| step:1414/1770 train_time:152304ms step_avg:107.71ms | |
| step:1415/1770 train_time:152415ms step_avg:107.71ms | |
| step:1416/1770 train_time:152528ms step_avg:107.72ms | |
| step:1417/1770 train_time:152639ms step_avg:107.72ms | |
| step:1418/1770 train_time:152750ms step_avg:107.72ms | |
| step:1419/1770 train_time:152865ms step_avg:107.73ms | |
| step:1420/1770 train_time:152974ms step_avg:107.73ms | |
| step:1421/1770 train_time:153086ms step_avg:107.73ms | |
| step:1422/1770 train_time:153199ms step_avg:107.73ms | |
| step:1423/1770 train_time:153311ms step_avg:107.74ms | |
| step:1424/1770 train_time:153423ms step_avg:107.74ms | |
| step:1425/1770 train_time:153535ms step_avg:107.74ms | |
| step:1426/1770 train_time:153646ms step_avg:107.75ms | |
| step:1427/1770 train_time:153758ms step_avg:107.75ms | |
| step:1428/1770 train_time:153871ms step_avg:107.75ms | |
| step:1429/1770 train_time:153982ms step_avg:107.75ms | |
| step:1430/1770 train_time:154096ms step_avg:107.76ms | |
| step:1431/1770 train_time:154205ms step_avg:107.76ms | |
| step:1432/1770 train_time:154317ms step_avg:107.76ms | |
| step:1433/1770 train_time:154429ms step_avg:107.77ms | |
| step:1434/1770 train_time:154540ms step_avg:107.77ms | |
| step:1435/1770 train_time:154653ms step_avg:107.77ms | |
| step:1436/1770 train_time:154765ms step_avg:107.77ms | |
| step:1437/1770 train_time:154877ms step_avg:107.78ms | |
| step:1438/1770 train_time:154988ms step_avg:107.78ms | |
| step:1439/1770 train_time:155099ms step_avg:107.78ms | |
| step:1440/1770 train_time:155211ms step_avg:107.79ms | |
| step:1441/1770 train_time:155323ms step_avg:107.79ms | |
| step:1442/1770 train_time:155435ms step_avg:107.79ms | |
| step:1443/1770 train_time:155547ms step_avg:107.79ms | |
| step:1444/1770 train_time:155659ms step_avg:107.80ms | |
| step:1445/1770 train_time:155772ms step_avg:107.80ms | |
| step:1446/1770 train_time:155885ms step_avg:107.80ms | |
| step:1447/1770 train_time:155998ms step_avg:107.81ms | |
| step:1448/1770 train_time:156113ms step_avg:107.81ms | |
| step:1449/1770 train_time:156225ms step_avg:107.82ms | |
| step:1450/1770 train_time:156337ms step_avg:107.82ms | |
| step:1451/1770 train_time:156450ms step_avg:107.82ms | |
| step:1452/1770 train_time:156562ms step_avg:107.83ms | |
| step:1453/1770 train_time:156674ms step_avg:107.83ms | |
| step:1454/1770 train_time:156787ms step_avg:107.83ms | |
| step:1455/1770 train_time:156900ms step_avg:107.84ms | |
| step:1456/1770 train_time:157013ms step_avg:107.84ms | |
| step:1457/1770 train_time:157126ms step_avg:107.84ms | |
| step:1458/1770 train_time:157239ms step_avg:107.85ms | |
| step:1459/1770 train_time:157351ms step_avg:107.85ms | |
| step:1460/1770 train_time:157464ms step_avg:107.85ms | |
| step:1461/1770 train_time:157576ms step_avg:107.86ms | |
| step:1462/1770 train_time:157690ms step_avg:107.86ms | |
| step:1463/1770 train_time:157802ms step_avg:107.86ms | |
| step:1464/1770 train_time:157914ms step_avg:107.86ms | |
| step:1465/1770 train_time:158027ms step_avg:107.87ms | |
| step:1466/1770 train_time:158140ms step_avg:107.87ms | |
| step:1467/1770 train_time:158253ms step_avg:107.88ms | |
| step:1468/1770 train_time:158366ms step_avg:107.88ms | |
| step:1469/1770 train_time:158478ms step_avg:107.88ms | |
| step:1470/1770 train_time:158590ms step_avg:107.88ms | |
| step:1471/1770 train_time:158702ms step_avg:107.89ms | |
| step:1472/1770 train_time:158813ms step_avg:107.89ms | |
| step:1473/1770 train_time:158926ms step_avg:107.89ms | |
| step:1474/1770 train_time:159039ms step_avg:107.90ms | |
| step:1475/1770 train_time:159151ms step_avg:107.90ms | |
| step:1476/1770 train_time:159264ms step_avg:107.90ms | |
| step:1477/1770 train_time:159376ms step_avg:107.91ms | |
| step:1478/1770 train_time:159489ms step_avg:107.91ms | |
| step:1479/1770 train_time:159602ms step_avg:107.91ms | |
| step:1480/1770 train_time:159715ms step_avg:107.92ms | |
| step:1481/1770 train_time:159827ms step_avg:107.92ms | |
| step:1482/1770 train_time:159939ms step_avg:107.92ms | |
| step:1483/1770 train_time:160054ms step_avg:107.93ms | |
| step:1484/1770 train_time:160166ms step_avg:107.93ms | |
| step:1485/1770 train_time:160277ms step_avg:107.93ms | |
| step:1486/1770 train_time:160390ms step_avg:107.93ms | |
| step:1487/1770 train_time:160502ms step_avg:107.94ms | |
| step:1488/1770 train_time:160615ms step_avg:107.94ms | |
| step:1489/1770 train_time:160728ms step_avg:107.94ms | |
| step:1490/1770 train_time:160839ms step_avg:107.95ms | |
| step:1491/1770 train_time:160952ms step_avg:107.95ms | |
| step:1492/1770 train_time:161064ms step_avg:107.95ms | |
| step:1493/1770 train_time:161176ms step_avg:107.95ms | |
| step:1494/1770 train_time:161289ms step_avg:107.96ms | |
| step:1495/1770 train_time:161400ms step_avg:107.96ms | |
| step:1496/1770 train_time:161513ms step_avg:107.96ms | |
| step:1497/1770 train_time:161625ms step_avg:107.97ms | |
| step:1498/1770 train_time:161738ms step_avg:107.97ms | |
| step:1499/1770 train_time:161852ms step_avg:107.97ms | |
| step:1500/1770 train_time:161966ms step_avg:107.98ms | |
| step:1500/1770 val_loss:3.8566 train_time:161971ms step_avg:107.98ms | |
| step:1501/1770 train_time:162084ms step_avg:107.98ms | |
| step:1502/1770 train_time:162197ms step_avg:107.99ms | |
| step:1503/1770 train_time:162310ms step_avg:107.99ms | |
| step:1504/1770 train_time:162424ms step_avg:107.99ms | |
| step:1505/1770 train_time:162538ms step_avg:108.00ms | |
| step:1506/1770 train_time:162650ms step_avg:108.00ms | |
| step:1507/1770 train_time:162765ms step_avg:108.01ms | |
| step:1508/1770 train_time:162877ms step_avg:108.01ms | |
| step:1509/1770 train_time:162991ms step_avg:108.01ms | |
| step:1510/1770 train_time:163103ms step_avg:108.02ms | |
| step:1511/1770 train_time:163216ms step_avg:108.02ms | |
| step:1512/1770 train_time:163330ms step_avg:108.02ms | |
| step:1513/1770 train_time:163442ms step_avg:108.02ms | |
| step:1514/1770 train_time:163554ms step_avg:108.03ms | |
| step:1515/1770 train_time:163668ms step_avg:108.03ms | |
| step:1516/1770 train_time:163781ms step_avg:108.03ms | |
| step:1517/1770 train_time:163893ms step_avg:108.04ms | |
| step:1518/1770 train_time:164006ms step_avg:108.04ms | |
| step:1519/1770 train_time:164119ms step_avg:108.04ms | |
| step:1520/1770 train_time:164232ms step_avg:108.05ms | |
| step:1521/1770 train_time:164346ms step_avg:108.05ms | |
| step:1522/1770 train_time:164458ms step_avg:108.05ms | |
| step:1523/1770 train_time:164571ms step_avg:108.06ms | |
| step:1524/1770 train_time:164683ms step_avg:108.06ms | |
| step:1525/1770 train_time:164797ms step_avg:108.06ms | |
| step:1526/1770 train_time:164910ms step_avg:108.07ms | |
| step:1527/1770 train_time:165023ms step_avg:108.07ms | |
| step:1528/1770 train_time:165136ms step_avg:108.07ms | |
| step:1529/1770 train_time:165250ms step_avg:108.08ms | |
| step:1530/1770 train_time:165363ms step_avg:108.08ms | |
| step:1531/1770 train_time:165475ms step_avg:108.08ms | |
| step:1532/1770 train_time:165589ms step_avg:108.09ms | |
| step:1533/1770 train_time:165701ms step_avg:108.09ms | |
| step:1534/1770 train_time:165814ms step_avg:108.09ms | |
| step:1535/1770 train_time:165926ms step_avg:108.09ms | |
| step:1536/1770 train_time:166038ms step_avg:108.10ms | |
| step:1537/1770 train_time:166150ms step_avg:108.10ms | |
| step:1538/1770 train_time:166264ms step_avg:108.10ms | |
| step:1539/1770 train_time:166376ms step_avg:108.11ms | |
| step:1540/1770 train_time:166489ms step_avg:108.11ms | |
| step:1541/1770 train_time:166602ms step_avg:108.11ms | |
| step:1542/1770 train_time:166714ms step_avg:108.12ms | |
| step:1543/1770 train_time:166827ms step_avg:108.12ms | |
| step:1544/1770 train_time:166940ms step_avg:108.12ms | |
| step:1545/1770 train_time:167059ms step_avg:108.13ms | |
| step:1546/1770 train_time:167169ms step_avg:108.13ms | |
| step:1547/1770 train_time:167282ms step_avg:108.13ms | |
| step:1548/1770 train_time:167394ms step_avg:108.14ms | |
| step:1549/1770 train_time:167507ms step_avg:108.14ms | |
| step:1550/1770 train_time:167620ms step_avg:108.14ms | |
| step:1551/1770 train_time:167732ms step_avg:108.14ms | |
| step:1552/1770 train_time:167850ms step_avg:108.15ms | |
| step:1553/1770 train_time:167960ms step_avg:108.15ms | |
| step:1554/1770 train_time:168071ms step_avg:108.15ms | |
| step:1555/1770 train_time:168184ms step_avg:108.16ms | |
| step:1556/1770 train_time:168297ms step_avg:108.16ms | |
| step:1557/1770 train_time:168408ms step_avg:108.16ms | |
| step:1558/1770 train_time:168521ms step_avg:108.16ms | |
| step:1559/1770 train_time:168634ms step_avg:108.17ms | |
| step:1560/1770 train_time:168747ms step_avg:108.17ms | |
| step:1561/1770 train_time:168860ms step_avg:108.17ms | |
| step:1562/1770 train_time:168973ms step_avg:108.18ms | |
| step:1563/1770 train_time:169085ms step_avg:108.18ms | |
| step:1564/1770 train_time:169197ms step_avg:108.18ms | |
| step:1565/1770 train_time:169313ms step_avg:108.19ms | |
| step:1566/1770 train_time:169422ms step_avg:108.19ms | |
| step:1567/1770 train_time:169535ms step_avg:108.19ms | |
| step:1568/1770 train_time:169648ms step_avg:108.19ms | |
| step:1569/1770 train_time:169762ms step_avg:108.20ms | |
| step:1570/1770 train_time:169874ms step_avg:108.20ms | |
| step:1571/1770 train_time:169987ms step_avg:108.20ms | |
| step:1572/1770 train_time:170099ms step_avg:108.21ms | |
| step:1573/1770 train_time:170211ms step_avg:108.21ms | |
| step:1574/1770 train_time:170325ms step_avg:108.21ms | |
| step:1575/1770 train_time:170437ms step_avg:108.21ms | |
| step:1576/1770 train_time:170550ms step_avg:108.22ms | |
| step:1577/1770 train_time:170665ms step_avg:108.22ms | |
| step:1578/1770 train_time:170779ms step_avg:108.22ms | |
| step:1579/1770 train_time:170896ms step_avg:108.23ms | |
| step:1580/1770 train_time:171009ms step_avg:108.23ms | |
| step:1581/1770 train_time:171122ms step_avg:108.24ms | |
| step:1582/1770 train_time:171235ms step_avg:108.24ms | |
| step:1583/1770 train_time:171349ms step_avg:108.24ms | |
| step:1584/1770 train_time:171462ms step_avg:108.25ms | |
| step:1585/1770 train_time:171575ms step_avg:108.25ms | |
| step:1586/1770 train_time:171690ms step_avg:108.25ms | |
| step:1587/1770 train_time:171803ms step_avg:108.26ms | |
| step:1588/1770 train_time:171916ms step_avg:108.26ms | |
| step:1589/1770 train_time:172029ms step_avg:108.26ms | |
| step:1590/1770 train_time:172142ms step_avg:108.27ms | |
| step:1591/1770 train_time:172255ms step_avg:108.27ms | |
| step:1592/1770 train_time:172368ms step_avg:108.27ms | |
| step:1593/1770 train_time:172483ms step_avg:108.28ms | |
| step:1594/1770 train_time:172595ms step_avg:108.28ms | |
| step:1595/1770 train_time:172708ms step_avg:108.28ms | |
| step:1596/1770 train_time:172821ms step_avg:108.28ms | |
| step:1597/1770 train_time:172934ms step_avg:108.29ms | |
| step:1598/1770 train_time:173048ms step_avg:108.29ms | |
| step:1599/1770 train_time:173161ms step_avg:108.29ms | |
| step:1600/1770 train_time:173271ms step_avg:108.29ms | |
| step:1601/1770 train_time:173383ms step_avg:108.30ms | |
| step:1602/1770 train_time:173497ms step_avg:108.30ms | |
| step:1603/1770 train_time:173608ms step_avg:108.30ms | |
| step:1604/1770 train_time:173722ms step_avg:108.31ms | |
| step:1605/1770 train_time:173835ms step_avg:108.31ms | |
| step:1606/1770 train_time:173948ms step_avg:108.31ms | |
| step:1607/1770 train_time:174061ms step_avg:108.31ms | |
| step:1608/1770 train_time:174173ms step_avg:108.32ms | |
| step:1609/1770 train_time:174287ms step_avg:108.32ms | |
| step:1610/1770 train_time:174399ms step_avg:108.32ms | |
| step:1611/1770 train_time:174511ms step_avg:108.32ms | |
| step:1612/1770 train_time:174624ms step_avg:108.33ms | |
| step:1613/1770 train_time:174738ms step_avg:108.33ms | |
| step:1614/1770 train_time:174850ms step_avg:108.33ms | |
| step:1615/1770 train_time:174963ms step_avg:108.34ms | |
| step:1616/1770 train_time:175077ms step_avg:108.34ms | |
| step:1617/1770 train_time:175191ms step_avg:108.34ms | |
| step:1618/1770 train_time:175305ms step_avg:108.35ms | |
| step:1619/1770 train_time:175418ms step_avg:108.35ms | |
| step:1620/1770 train_time:175531ms step_avg:108.35ms | |
| step:1621/1770 train_time:175646ms step_avg:108.36ms | |
| step:1622/1770 train_time:175762ms step_avg:108.36ms | |
| step:1623/1770 train_time:175877ms step_avg:108.37ms | |
| step:1624/1770 train_time:175991ms step_avg:108.37ms | |
| step:1625/1770 train_time:176103ms step_avg:108.37ms | |
| step:1625/1770 val_loss:3.8145 train_time:176106ms step_avg:108.37ms | |
| step:1626/1770 train_time:176220ms step_avg:108.38ms | |
| step:1627/1770 train_time:176334ms step_avg:108.38ms | |
| step:1628/1770 train_time:176447ms step_avg:108.38ms | |
| step:1629/1770 train_time:176559ms step_avg:108.38ms | |
| step:1630/1770 train_time:176673ms step_avg:108.39ms | |
| step:1631/1770 train_time:176786ms step_avg:108.39ms | |
| step:1632/1770 train_time:176899ms step_avg:108.39ms | |
| step:1633/1770 train_time:177012ms step_avg:108.40ms | |
| step:1634/1770 train_time:177124ms step_avg:108.40ms | |
| step:1635/1770 train_time:177238ms step_avg:108.40ms | |
| step:1636/1770 train_time:177351ms step_avg:108.41ms | |
| step:1637/1770 train_time:177463ms step_avg:108.41ms | |
| step:1638/1770 train_time:177576ms step_avg:108.41ms | |
| step:1639/1770 train_time:177690ms step_avg:108.41ms | |
| step:1640/1770 train_time:177803ms step_avg:108.42ms | |
| step:1641/1770 train_time:177916ms step_avg:108.42ms | |
| step:1642/1770 train_time:178029ms step_avg:108.42ms | |
| step:1643/1770 train_time:178142ms step_avg:108.42ms | |
| step:1644/1770 train_time:178257ms step_avg:108.43ms | |
| step:1645/1770 train_time:178367ms step_avg:108.43ms | |
| step:1646/1770 train_time:178481ms step_avg:108.43ms | |
| step:1647/1770 train_time:178594ms step_avg:108.44ms | |
| step:1648/1770 train_time:178706ms step_avg:108.44ms | |
| step:1649/1770 train_time:178819ms step_avg:108.44ms | |
| step:1650/1770 train_time:178933ms step_avg:108.44ms | |
| step:1651/1770 train_time:179047ms step_avg:108.45ms | |
| step:1652/1770 train_time:179159ms step_avg:108.45ms | |
| step:1653/1770 train_time:179274ms step_avg:108.45ms | |
| step:1654/1770 train_time:179386ms step_avg:108.46ms | |
| step:1655/1770 train_time:179498ms step_avg:108.46ms | |
| step:1656/1770 train_time:179609ms step_avg:108.46ms | |
| step:1657/1770 train_time:179723ms step_avg:108.46ms | |
| step:1658/1770 train_time:179836ms step_avg:108.47ms | |
| step:1659/1770 train_time:179949ms step_avg:108.47ms | |
| step:1660/1770 train_time:180062ms step_avg:108.47ms | |
| step:1661/1770 train_time:180178ms step_avg:108.48ms | |
| step:1662/1770 train_time:180291ms step_avg:108.48ms | |
| step:1663/1770 train_time:180404ms step_avg:108.48ms | |
| step:1664/1770 train_time:180517ms step_avg:108.48ms | |
| step:1665/1770 train_time:180630ms step_avg:108.49ms | |
| step:1666/1770 train_time:180743ms step_avg:108.49ms | |
| step:1667/1770 train_time:180857ms step_avg:108.49ms | |
| step:1668/1770 train_time:180969ms step_avg:108.49ms | |
| step:1669/1770 train_time:181087ms step_avg:108.50ms | |
| step:1670/1770 train_time:181196ms step_avg:108.50ms | |
| step:1671/1770 train_time:181308ms step_avg:108.50ms | |
| step:1672/1770 train_time:181421ms step_avg:108.51ms | |
| step:1673/1770 train_time:181533ms step_avg:108.51ms | |
| step:1674/1770 train_time:181645ms step_avg:108.51ms | |
| step:1675/1770 train_time:181759ms step_avg:108.51ms | |
| step:1676/1770 train_time:181872ms step_avg:108.52ms | |
| step:1677/1770 train_time:181985ms step_avg:108.52ms | |
| step:1678/1770 train_time:182097ms step_avg:108.52ms | |
| step:1679/1770 train_time:182210ms step_avg:108.52ms | |
| step:1680/1770 train_time:182321ms step_avg:108.52ms | |
| step:1681/1770 train_time:182434ms step_avg:108.53ms | |
| step:1682/1770 train_time:182547ms step_avg:108.53ms | |
| step:1683/1770 train_time:182660ms step_avg:108.53ms | |
| step:1684/1770 train_time:182772ms step_avg:108.53ms | |
| step:1685/1770 train_time:182885ms step_avg:108.54ms | |
| step:1686/1770 train_time:182997ms step_avg:108.54ms | |
| step:1687/1770 train_time:183109ms step_avg:108.54ms | |
| step:1688/1770 train_time:183221ms step_avg:108.54ms | |
| step:1689/1770 train_time:183334ms step_avg:108.55ms | |
| step:1690/1770 train_time:183447ms step_avg:108.55ms | |
| step:1691/1770 train_time:183559ms step_avg:108.55ms | |
| step:1692/1770 train_time:183673ms step_avg:108.55ms | |
| step:1693/1770 train_time:183786ms step_avg:108.56ms | |
| step:1694/1770 train_time:183899ms step_avg:108.56ms | |
| step:1695/1770 train_time:184012ms step_avg:108.56ms | |
| step:1696/1770 train_time:184129ms step_avg:108.57ms | |
| step:1697/1770 train_time:184239ms step_avg:108.57ms | |
| step:1698/1770 train_time:184352ms step_avg:108.57ms | |
| step:1699/1770 train_time:184463ms step_avg:108.57ms | |
| step:1700/1770 train_time:184576ms step_avg:108.57ms | |
| step:1701/1770 train_time:184690ms step_avg:108.58ms | |
| step:1702/1770 train_time:184801ms step_avg:108.58ms | |
| step:1703/1770 train_time:184917ms step_avg:108.58ms | |
| step:1704/1770 train_time:185029ms step_avg:108.59ms | |
| step:1705/1770 train_time:185142ms step_avg:108.59ms | |
| step:1706/1770 train_time:185256ms step_avg:108.59ms | |
| step:1707/1770 train_time:185372ms step_avg:108.60ms | |
| step:1708/1770 train_time:185485ms step_avg:108.60ms | |
| step:1709/1770 train_time:185600ms step_avg:108.60ms | |
| step:1710/1770 train_time:185713ms step_avg:108.60ms | |
| step:1711/1770 train_time:185827ms step_avg:108.61ms | |
| step:1712/1770 train_time:185939ms step_avg:108.61ms | |
| step:1713/1770 train_time:186054ms step_avg:108.61ms | |
| step:1714/1770 train_time:186168ms step_avg:108.62ms | |
| step:1715/1770 train_time:186282ms step_avg:108.62ms | |
| step:1716/1770 train_time:186398ms step_avg:108.62ms | |
| step:1717/1770 train_time:186513ms step_avg:108.63ms | |
| step:1718/1770 train_time:186627ms step_avg:108.63ms | |
| step:1719/1770 train_time:186741ms step_avg:108.63ms | |
| step:1720/1770 train_time:186857ms step_avg:108.64ms | |
| step:1721/1770 train_time:186967ms step_avg:108.64ms | |
| step:1722/1770 train_time:187079ms step_avg:108.64ms | |
| step:1723/1770 train_time:187193ms step_avg:108.64ms | |
| step:1724/1770 train_time:187308ms step_avg:108.65ms | |
| step:1725/1770 train_time:187421ms step_avg:108.65ms | |
| step:1726/1770 train_time:187535ms step_avg:108.65ms | |
| step:1727/1770 train_time:187649ms step_avg:108.66ms | |
| step:1728/1770 train_time:187762ms step_avg:108.66ms | |
| step:1729/1770 train_time:187875ms step_avg:108.66ms | |
| step:1730/1770 train_time:187988ms step_avg:108.66ms | |
| step:1731/1770 train_time:188102ms step_avg:108.67ms | |
| step:1732/1770 train_time:188216ms step_avg:108.67ms | |
| step:1733/1770 train_time:188329ms step_avg:108.67ms | |
| step:1734/1770 train_time:188441ms step_avg:108.67ms | |
| step:1735/1770 train_time:188557ms step_avg:108.68ms | |
| step:1736/1770 train_time:188671ms step_avg:108.68ms | |
| step:1737/1770 train_time:188785ms step_avg:108.68ms | |
| step:1738/1770 train_time:188898ms step_avg:108.69ms | |
| step:1739/1770 train_time:189011ms step_avg:108.69ms | |
| step:1740/1770 train_time:189126ms step_avg:108.69ms | |
| step:1741/1770 train_time:189239ms step_avg:108.70ms | |
| step:1742/1770 train_time:189354ms step_avg:108.70ms | |
| step:1743/1770 train_time:189469ms step_avg:108.70ms | |
| step:1744/1770 train_time:189580ms step_avg:108.70ms | |
| step:1745/1770 train_time:189694ms step_avg:108.71ms | |
| step:1746/1770 train_time:189809ms step_avg:108.71ms | |
| step:1747/1770 train_time:189922ms step_avg:108.71ms | |
| step:1748/1770 train_time:190036ms step_avg:108.72ms | |
| step:1749/1770 train_time:190150ms step_avg:108.72ms | |
| step:1750/1770 train_time:190264ms step_avg:108.72ms | |
| step:1750/1770 val_loss:3.7810 train_time:190268ms step_avg:108.72ms | |
| step:1751/1770 train_time:190390ms step_avg:108.73ms | |
| step:1752/1770 train_time:190506ms step_avg:108.74ms | |
| step:1753/1770 train_time:190619ms step_avg:108.74ms | |
| step:1754/1770 train_time:190732ms step_avg:108.74ms | |
| step:1755/1770 train_time:190847ms step_avg:108.74ms | |
| step:1756/1770 train_time:190960ms step_avg:108.75ms | |
| step:1757/1770 train_time:191078ms step_avg:108.75ms | |
| step:1758/1770 train_time:191188ms step_avg:108.75ms | |
| step:1759/1770 train_time:191301ms step_avg:108.76ms | |
| step:1760/1770 train_time:191414ms step_avg:108.76ms | |
| step:1761/1770 train_time:191528ms step_avg:108.76ms | |
| step:1762/1770 train_time:191643ms step_avg:108.76ms | |
| step:1763/1770 train_time:191756ms step_avg:108.77ms | |
| step:1764/1770 train_time:191870ms step_avg:108.77ms | |
| step:1765/1770 train_time:191988ms step_avg:108.77ms | |
| step:1766/1770 train_time:192101ms step_avg:108.78ms | |
| step:1767/1770 train_time:192214ms step_avg:108.78ms | |
| step:1768/1770 train_time:192328ms step_avg:108.78ms | |
| step:1769/1770 train_time:192443ms step_avg:108.79ms | |
| step:1770/1770 train_time:192557ms step_avg:108.79ms | |
| step:1770/1770 val_loss:3.7770 train_time:192561ms step_avg:108.79ms | |
| peak memory allocated: 31310 MiB reserved: 45252 MiB |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
import glob
from dataclasses import dataclass
from functools import lru_cache, partial # Added partial for hook registration
from pathlib import Path
# Allow the CUDA caching allocator to grow segments instead of fragmenting;
# must be set before torch initializes CUDA.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    """FP8 forward matmul: returns (x @ w.T in bf16, x_f8, w_f8).

    x and w are divided by x_s / w_s before the e4m3 cast, and torch._scaled_mm
    re-applies those scales, so the bf16 output approximates x @ w.T.
    The f8 casts are returned so the backward op can reuse them.
    grad_s is unused here; it is carried through for the backward pass
    (see setup_context / mm_backward_op).
    """
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        # scale down, then quantize to float8 e4m3 (forward-friendly f8 format)
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8
    return impl(x, w)
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    # Fake (meta) implementation for tracing/compile: only shapes/dtypes matter.
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    """FP8 backward for nanogpt::mm: returns (grad_x in bf16, grad_w in fp32).

    Reuses the f8 casts saved by the forward. The incoming grad is quantized
    to e5m2 (wider exponent range, suited to gradients) with scale grad_s.
    """
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        # grad_x = grad @ w; .T.contiguous().T gives a column-major operand,
        # the layout _scaled_mm wants for its second argument
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T
        return grad_x, grad_w
    return impl(g, x_f8, w_f8)
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    # Fake impl: produce outputs with the right shapes/dtypes for tracing.
    return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32)
def backward(ctx, grad_out: Tensor, *_):
    # Autograd bridge for mm_op: only the first output (the matmul result)
    # receives a gradient; the f8 casts do not.
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    # no gradients for the three float scale arguments
    return grad_x, grad_w, None, None, None
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    # Stash the f8 casts and the scale constants from the forward so the
    # backward op can reuse them without re-quantizing.
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)
mm_op.register_autograd(backward, setup_context=setup_context)
| # ----------------------------------------------------------------------------- | |
| # Muon optimizer | |
@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # work in the wide orientation so X @ X.mT is the smaller Gram matrix
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose taken above
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Some warnings:
    - This optimizer should not be used for the embedding layer, the final fully connected layer,
    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
        rank: this process's rank in the distributed group.
        world_size: number of processes sharing the parameter updates.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params: list[Tensor] = [*params]
        param_groups = []
        # group params by numel so each group can share one flat bf16 all-gather buffer;
        # update_buffer_views[i] is rank i's row of that buffer
        for size in {p.numel() for p in params}:
            b = torch.empty(world_size, size, dtype=torch.bfloat16, device="cuda")
            group = dict(params=[p for p in params if p.numel() == size],
                         update_buffer=b, update_buffer_views=[b[i] for i in range(world_size)])
            param_groups.append(group)
        super().__init__(param_groups, defaults)
    @torch.no_grad()
    def step(self):
        # Pipelined distributed step: each rank orthogonalizes its slice of the
        # params, all ranks exchange updates with an async all_gather, and the
        # previous gather's results are applied while the next one is in flight.
        for group in self.param_groups:
            update_buffer: Tensor = group["update_buffer"]
            update_buffer_views: list[Tensor] = group["update_buffer_views"]
            # generate weight updates in distributed fashion
            params: list[Tensor] = group["params"]
            handle = None
            params_world = None
            def update_prev(): # optimized Muon implementation contributed by @YouJiacheng
                # wait for the in-flight all_gather, then apply each rank's update;
                # the aspect-ratio factor rescales the step for non-square matrices
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffer_views):
                    p_world.add_(g_world.view_as(p_world),
                                 alpha=-group["lr"] * max(1, p_world.size(-2) / p_world.size(-1))**0.5)
            for base_i in range(len(params))[::self.world_size]:
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf: Tensor = state["momentum_buffer"]
                    buf.lerp_(g, 1 - group["momentum"])
                    g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
                    g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]).flatten()
                else:
                    # this rank has no param in this stripe; contribute a dummy row
                    g = update_buffer_views[self.rank]
                if base_i > 0:
                    update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            update_prev()
| # ----------------------------------------------------------------------------- | |
| # PyTorch nn.Module definitions for the model | |
| def norm(x: Tensor): | |
| return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    # Bias-free linear layer that optionally routes through the FP8 custom op
    # (nanogpt::mm) while training; at eval time it falls back to F.linear.
    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        self.x_s = x_s # input scale for the f8 cast
        self.w_s = w_s # weight scale for the f8 cast
        self.grad_s = grad_s # gradient scale used in the backward pass
    def reset_parameters(self) -> None:
        std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)
    def forward(self, x: Tensor):
        if self.use_fp8 and self.training:
            _x = x.flatten(0, -2) # collapse leading dims: the custom op expects 2D input
            out: Tensor = torch.ops.nanogpt.mm(_x, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
            return out.reshape(*x.shape[:-1], -1)
        else:
            # eval (or fp8 disabled): plain matmul with the weight cast to the input dtype
            return F.linear(x, self.weight.type_as(x))
| class Rotary(nn.Module): | |
| def __init__(self, dim: int, max_seq_len: int): | |
| super().__init__() | |
| # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) | |
| angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32) | |
| angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)]) | |
| t = torch.arange(max_seq_len, dtype=torch.float32) | |
| theta = torch.einsum("i,j -> ij", t, angular_freq) | |
| self.cos = nn.Buffer(theta.cos(), persistent=False) | |
| self.sin = nn.Buffer(theta.sin(), persistent=False) | |
| def forward(self, x_BTHD: Tensor): | |
| assert self.cos.size(0) >= x_BTHD.size(-3) | |
| cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :] | |
| x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1) | |
| y1 = x1 * cos + x2 * sin | |
| y2 = x1 * (-sin) + x2 * cos | |
| return torch.cat((y1, y2), 3).type_as(x_BTHD) | |
class CausalSelfAttention(nn.Module):
    # Multi-head causal attention: merged QKV projection, QK RMS-norm, rotary
    # embeddings, optional value-embedding mixing, FlexAttention with a fixed
    # logit scale, and a zero-initialized output projection.
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5])) # mixing weights: v vs. token value embedding
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
    def forward(self, x: Tensor, ve: Tensor | None, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # one matmul produces q, k, v stacked along the head axis, then chunked apart
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        if ve is not None:
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, scale=0.12).transpose(1, 2)
        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y
class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with squared-ReLU activation."""
    def __init__(self, dim: int):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
    def forward(self, x: Tensor):
        # ReLU^2: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = self.c_fc(x)
        activated = F.relu(hidden).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """Transformer block: learnable mix of the running stream with the
    embedding stream x0, then (optional) attention and an MLP, each with
    a pre-norm residual connection."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = None if layer_idx == 7 else CausalSelfAttention(dim, num_heads, max_seq_len)
        self.mlp = MLP(dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))
    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, block_mask: BlockMask):
        # blend the running activations with the initial embeddings
        mixed = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is None:
            out = mixed
        else:
            out = mixed + self.attn(norm(mixed), ve, block_mask)
        return out + self.mlp(norm(out))
| # ----------------------------------------------------------------------------- | |
| # The main model | |
| def next_multiple_of_n(v: float | int, *, n: int): | |
| return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) | |
class GPT(nn.Module):
    # GPT-style decoder: token embeddings, three shared value embeddings, a
    # U-net stack of Blocks with learnable skip weights, and an FP8 lm_head
    # with tanh-style logit softcapping. forward() returns the loss directly.
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128),
                                    use_fp8=True, x_s=(model_dim**0.5)/448, w_s=24/448, grad_s=1/448)
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        self.skip_weights = nn.Parameter(torch.ones(num_layers//2))
    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        # Build (long, short) FlexAttention BlockMasks enforcing causality,
        # document boundaries, and a sliding window; short uses half the window.
        BLOCK_SIZE = 128
        docs = (input_seq == 50256).cumsum(0) # document index per token (50256 separates documents)
        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_blockmask: Tensor):
            # convert a dense boolean block mask to (counts, ordered indices) as int32
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        # per-block document id range: "any" = blocks overlap in documents, "all" = same single document
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)
        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapted from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)
    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        # NOTE: this mask schedule is hard-coded for 12 layers
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan: first half pushes activations, second half pops them
        skip_connections = []
        n = len(self.skip_weights)
        for i in range(len(self.blocks)):
            if i >= n:
                x = x + self.skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1)**0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction='sum' if self.training else 'mean')
        return loss
| # ----------------------------------------------------------------------------- | |
| # Our own simple Distributed Data Loader | |
def _load_data_shard(file: Path):
    """Read one .bin token shard into a pinned uint16 CPU tensor.

    Shard layout: 256 int32 header words (magic, version, token count),
    followed by the tokens as uint16.
    """
    header = torch.from_file(str(file), False, 256, dtype=torch.int32) # header is 256 int32
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4) # skip past the header
        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
def distributed_data_generator(filename_pattern: str, batch_size: int, rank : int, world_size : int):
    """Stream (inputs, targets) CUDA tensor pairs from .bin shards matching filename_pattern.

    batch_size is the GLOBAL token count consumed per step; each rank reads only its
    local_batch_size slice. Single-epoch: next(file_iter) raises StopIteration when
    the shards are exhausted.
    """
    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        if pos + batch_size + 1 >= len(tokens):
            # not enough tokens left in this shard for a full global batch (+1 for the shifted target)
            tokens, pos = _load_data_shard(next(file_iter)), 0
        # each rank takes a disjoint local_batch_size+1 window; the +1 overlap supplies the target shift
        buf = tokens[pos + rank * local_batch_size:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets
| # ----------------------------------------------------------------------------- | |
| # int main | |
@dataclass
class Hyperparameters:
    """Run configuration constants.

    NOTE(review): these attributes carry no type annotations, so @dataclass
    generates no fields from them — they act as plain class attributes and
    Hyperparameters() just exposes the class-level defaults.
    """
    # data
    train_files = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_seq_len = 48*1024 # FlexAttention sequence length
    val_seq_len = 4*64*1024 # FlexAttention sequence length for validation
    # optimization
    num_iterations = 1770 # number of iterations to run
    cooldown_frac = 0.4 # fraction of training spent cooling down the learning rate
    # architecture
    vocab_size = 50257 # GPT-2 BPE vocabulary size
    # evaluation and logging
    val_loss_every = 125 # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint = False # if True, dump model+optimizer state at the final step
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert world_size == 1 # this run is configured single-GPU (upstream nanogpt code targets 8xH100)
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier() # make sure every rank is up before any collective work
master_process = (rank == 0) # this process will do logging, checkpointing etc.
# begin logging: one uniquely-named logfile per run, written only by rank 0
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=False):
    """Append s to the rank-0 logfile; echo to stdout when console=True.

    No-op on non-master ranks, so it is safe to call unconditionally.
    """
    if not master_process:
        return
    with open(logfile, "a") as f:
        if console:
            print(s)
        print(s, file=f)
# begin by printing this file (the Python code) so every logfile is fully self-describing
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    """Return the stdout of `nvidia-smi` as text, for logging the GPU environment."""
    import subprocess # avoid top level import
    proc = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return proc.stdout
# record the GPU topology/driver state at launch time
print0(nvidia_smi())
print0("="*100)
########################################
#    Construct model and optimizer     #
########################################
model: nn.Module = GPT(vocab_size=args.vocab_size, num_layers=12, num_heads=6, model_dim=768,
                       max_seq_len=max(args.train_seq_len, args.val_seq_len)).cuda()
# keep embeddings in bf16 to halve their memory/bandwidth cost
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
# ensure all ranks start from rank 0's initialization
for param in model.parameters():
    dist.broadcast(param.detach(), 0)
# collect the parameters to optimize: Muon gets the 2D+ hidden matrices,
# Adam gets embeddings, the lm_head, and all scalar/vector params
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s); per-group learning rates tuned for this model size
adam_params = [dict(params=head_params, lr=0.22), dict(params=embed_params, lr=0.6), dict(params=scalar_params, lr=0.04)]
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), eps=1e-10, fused=True)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
optimizers = [optimizer1, optimizer2]
# remember the base lr so the schedule can scale it multiplicatively each step
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]
| # learning rate schedule: stable then decay | |
def get_lr(step: int):
    """Learning-rate multiplier: 1.0 during the stable phase, then a linear
    cooldown from 1.0 to 0.1 over the final cooldown_frac of training."""
    progress = step / args.num_iterations # progress in training
    assert 0 <= progress < 1
    if progress >= 1 - args.cooldown_frac:
        w = (1 - progress) / args.cooldown_frac
        # keep the original arithmetic so schedules match bit-for-bit
        return w * 1.0 + (1 - w) * 0.1
    return 1.0
| # attention window size schedule: linearly increase | |
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    # Convert the token window size into a device-side block count (128-token blocks).
    # lru_cache(1) reuses the same CUDA tensor while the window stays constant,
    # avoiding a fresh H2D copy every step.
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    """Return the sliding-window size for this step, as a CUDA int32 block count."""
    x = step / args.num_iterations # progress in training
    assert 0 <= x <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * x, n=128)
    return get_window_size_blocks_helper(window_size)
model: nn.Module = torch.compile(model, dynamic=False)
########################################
#            Warmup kernels            #
########################################
# Warmup the training kernels, then re-initialize the state so we aren't cheating:
# the compile/autotune cost is paid here on random tokens, outside the timed run.
warmup_steps = 10
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
for _ in range(warmup_steps):
    inputs = targets = torch.randint(0, args.vocab_size, size=(args.train_seq_len,), device="cuda")
    model(inputs.to(torch.int32), targets, get_window_size_blocks(0)).backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
# restore weights and optimizer state exactly as they were before warmup
model.load_state_dict(initial_state["model"])
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del initial_state
| ######################################## | |
| # Overlap Communication Setup # | |
| ######################################## | |
| # Create parameter buckets for better overlap | |
def create_buckets(params, bucket_size_mb=25):
    """Group parameters into buckets of approximately bucket_size_mb MB each.

    Parameters are greedily packed largest-first; a bucket is closed as soon as
    the next parameter would push it past the size limit (a single oversized
    parameter still gets its own bucket). Returns a list of parameter lists.
    """
    MB = 1024 * 1024
    # Largest-first ordering packs the big matrices into dedicated buckets
    ordered = sorted(params, key=lambda p: p.numel(), reverse=True)
    buckets, current, size_mb = [], [], 0.0
    for p in ordered:
        p_mb = p.numel() * p.element_size() / MB
        if current and size_mb + p_mb > bucket_size_mb:
            # current bucket is full: seal it and start a new one with p
            buckets.append(current)
            current, size_mb = [p], p_mb
        else:
            current.append(p)
            size_mb += p_mb
    if current:
        buckets.append(current)
    return buckets
# Create buckets for all trainable parameters and log the resulting layout
all_params = [p for p in model.parameters() if p.requires_grad]
param_buckets = create_buckets(all_params)
print0(f"Created {len(param_buckets)} gradient buckets")
for i, bucket in enumerate(param_buckets):
    total_size = sum(p.numel() * p.element_size() for p in bucket) / (1024 * 1024)
    print0(f"Bucket {i}: {len(bucket)} params, {total_size:.1f} MB")
# Bucket state tracking, mutated by _gradient_hook during backward and
# reset by wait_for_gradients after each step
bucket_ready_count = [0] * len(param_buckets)   # grads accumulated so far, per bucket
bucket_handles = [None] * len(param_buckets)    # in-flight async all-reduce work handles
param_to_bucket = {}
# Map each parameter to its bucket index for O(1) lookup inside the hook
for bucket_idx, bucket in enumerate(param_buckets):
    for param in bucket:
        param_to_bucket[param] = bucket_idx
def _gradient_hook(param: Tensor):
    """Called when a parameter's gradient is ready.

    Counts ready gradients per bucket; once every parameter in a bucket has its
    gradient, launches an async all-reduce (AVG) for the whole bucket so the
    communication overlaps with the rest of backward. Handles are collected by
    wait_for_gradients() before the optimizer step.
    """
    if param.grad is None:
        return
    bucket_idx = param_to_bucket[param]
    bucket_ready_count[bucket_idx] += 1
    # Check if all parameters in this bucket are ready
    if bucket_ready_count[bucket_idx] == len(param_buckets[bucket_idx]):
        # All-reduce this bucket
        bucket_grads = [p.grad for p in param_buckets[bucket_idx]]
        # For multi-tensor operations, we can reduce them together
        if len(bucket_grads) == 1:
            handle = dist.all_reduce(bucket_grads[0], op=dist.ReduceOp.AVG, async_op=True)
        else:
            # Use multi-tensor all-reduce for efficiency
            # NOTE(review): dist.all_reduce_coalesced is deprecated in newer
            # PyTorch releases — confirm against the installed version.
            handle = dist.all_reduce_coalesced(bucket_grads, op=dist.ReduceOp.AVG, async_op=True)
        bucket_handles[bucket_idx] = handle
# Register hooks for all parameters; post-accumulate-grad hooks fire once the
# parameter's .grad has been fully accumulated during backward
print0("Registering bucketed gradient hooks...")
for param in all_params:
    param.register_post_accumulate_grad_hook(_gradient_hook)
def wait_for_gradients():
    """Block until every in-flight bucket all-reduce finishes, then reset bucket state."""
    # Drain all outstanding async work handles first...
    for work in bucket_handles:
        if work is not None:
            work.wait()
    # ...then clear the bookkeeping for the next iteration. Mutate the lists in
    # place (slice assignment) because other code holds references to them.
    bucket_ready_count[:] = [0] * len(bucket_ready_count)
    bucket_handles[:] = [None] * len(bucket_handles)
| ######################################## | |
| # Training and validation # | |
| ######################################## | |
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, rank, world_size)
training_time_ms = 0
# start the clock (synchronize first so pending CUDA work isn't mis-attributed)
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training; the loop runs num_iterations training steps plus one final
# validation-only step
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock: validation time is excluded from the reported train time
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_batch_size = world_size * args.val_seq_len
        assert args.val_tokens % val_batch_size == 0 # fixed token budget for comparable numbers
        val_steps = args.val_tokens // val_batch_size
        val_loader = distributed_data_generator(args.val_files, val_batch_size, rank, world_size)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets = next(val_loader)
                val_loss += model(inputs, targets, get_window_size_blocks(step))
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) # average loss across ranks
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break
    # --------------- TRAINING SECTION -----------------
    inputs, targets = next(train_loader)
    model(inputs, targets, get_window_size_blocks(step)).backward()
    #for param in model.parameters():
    #    dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    wait_for_gradients() # does the same thing as commented two lines above, but faster
    # set optimization hyperparameters: multiplicative lr schedule on every group
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * get_lr(step)
    for group in optimizer2.param_groups:
        frac = min(step / 300, 1) # momentum warmup for muon: 0.85 -> 0.95 over 300 steps
        group["momentum"] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers
    for opt in optimizers:
        opt.step()
    # null the gradients (set_to_none frees the grad memory rather than zeroing)
    model.zero_grad(set_to_none=True)
    # logging: approximate because it reads the host clock without a CUDA sync
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
       f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True)
dist.destroy_process_group()
| ==================================================================================================== | |
| Running Python 3.12.3 (main, Feb 4 2025, 14:48:35) [GCC 13.3.0] | |
| Running PyTorch 2.7.1+cu126 compiled for CUDA 12.6 | |
| Wed Jun 25 21:10:13 2025 | |
| +-----------------------------------------------------------------------------------------+ | |
| | NVIDIA-SMI 560.35.03 Driver Version: 560.35.03 CUDA Version: 12.6 | | |
| |-----------------------------------------+------------------------+----------------------+ | |
| | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | | MIG M. | | |
| |=========================================+========================+======================| | |
| | 0 NVIDIA H100 80GB HBM3 On | 00000000:C6:00.0 Off | 0 | | |
| | N/A 46C P0 153W / 700W | 1184MiB / 81559MiB | 1% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| +-----------------------------------------------------------------------------------------+ | |
| | Processes: | | |
| | GPU GI CI PID Type Process name GPU Memory | | |
| | ID ID Usage | | |
| |=========================================================================================| | |
| +-----------------------------------------------------------------------------------------+ | |
| ==================================================================================================== | |
| Created 22 gradient buckets | |
| Bucket 0: 1 params, 147.4 MB | |
| Bucket 1: 1 params, 73.6 MB | |
| Bucket 2: 1 params, 73.6 MB | |
| Bucket 3: 1 params, 73.6 MB | |
| Bucket 4: 1 params, 73.6 MB | |
| Bucket 5: 2 params, 18.0 MB | |
| Bucket 6: 2 params, 18.0 MB | |
| Bucket 7: 2 params, 18.0 MB | |
| Bucket 8: 2 params, 18.0 MB | |
| Bucket 9: 2 params, 18.0 MB | |
| Bucket 10: 2 params, 18.0 MB | |
| Bucket 11: 2 params, 18.0 MB | |
| Bucket 12: 2 params, 18.0 MB | |
| Bucket 13: 2 params, 18.0 MB | |
| Bucket 14: 2 params, 18.0 MB | |
| Bucket 15: 2 params, 18.0 MB | |
| Bucket 16: 3 params, 24.8 MB | |
| Bucket 17: 3 params, 20.2 MB | |
| Bucket 18: 3 params, 20.2 MB | |
| Bucket 19: 3 params, 20.2 MB | |
| Bucket 20: 9 params, 24.8 MB | |
| Bucket 21: 27 params, 6.8 MB | |
| Registering bucketed gradient hooks... | |
| step:0/1770 val_loss:10.8258 train_time:0ms step_avg:0.02ms | |
| step:1/1770 train_time:115ms step_avg:114.87ms | |
| step:2/1770 train_time:213ms step_avg:106.31ms | |
| step:3/1770 train_time:313ms step_avg:104.17ms | |
| step:4/1770 train_time:414ms step_avg:103.61ms | |
| step:5/1770 train_time:517ms step_avg:103.42ms | |
| step:6/1770 train_time:618ms step_avg:103.06ms | |
| step:7/1770 train_time:720ms step_avg:102.92ms | |
| step:8/1770 train_time:822ms step_avg:102.81ms | |
| step:9/1770 train_time:924ms step_avg:102.62ms | |
| step:10/1770 train_time:1024ms step_avg:102.44ms | |
| step:11/1770 train_time:1125ms step_avg:102.30ms | |
| step:12/1770 train_time:1227ms step_avg:102.26ms | |
| step:13/1770 train_time:1329ms step_avg:102.20ms | |
| step:14/1770 train_time:1430ms step_avg:102.12ms | |
| step:15/1770 train_time:1532ms step_avg:102.14ms | |
| step:16/1770 train_time:1636ms step_avg:102.22ms | |
| step:17/1770 train_time:1739ms step_avg:102.27ms | |
| step:18/1770 train_time:1840ms step_avg:102.24ms | |
| step:19/1770 train_time:1942ms step_avg:102.19ms | |
| step:20/1770 train_time:2043ms step_avg:102.17ms | |
| step:21/1770 train_time:2145ms step_avg:102.13ms | |
| step:22/1770 train_time:2246ms step_avg:102.10ms | |
| step:23/1770 train_time:2347ms step_avg:102.05ms | |
| step:24/1770 train_time:2448ms step_avg:102.00ms | |
| step:25/1770 train_time:2554ms step_avg:102.18ms | |
| step:26/1770 train_time:2656ms step_avg:102.14ms | |
| step:27/1770 train_time:2757ms step_avg:102.11ms | |
| step:28/1770 train_time:2858ms step_avg:102.06ms | |
| step:29/1770 train_time:2958ms step_avg:102.01ms | |
| step:30/1770 train_time:3061ms step_avg:102.03ms | |
| step:31/1770 train_time:3160ms step_avg:101.95ms | |
| step:32/1770 train_time:3262ms step_avg:101.93ms | |
| step:33/1770 train_time:3363ms step_avg:101.91ms | |
| step:34/1770 train_time:3464ms step_avg:101.88ms | |
| step:35/1770 train_time:3565ms step_avg:101.87ms | |
| step:36/1770 train_time:3667ms step_avg:101.86ms | |
| step:37/1770 train_time:3768ms step_avg:101.84ms | |
| step:38/1770 train_time:3869ms step_avg:101.83ms | |
| step:39/1770 train_time:3971ms step_avg:101.83ms | |
| step:40/1770 train_time:4073ms step_avg:101.82ms | |
| step:41/1770 train_time:4175ms step_avg:101.83ms | |
| step:42/1770 train_time:4277ms step_avg:101.83ms | |
| step:43/1770 train_time:4378ms step_avg:101.81ms | |
| step:44/1770 train_time:4479ms step_avg:101.80ms | |
| step:45/1770 train_time:4580ms step_avg:101.78ms | |
| step:46/1770 train_time:4681ms step_avg:101.77ms | |
| step:47/1770 train_time:4783ms step_avg:101.76ms | |
| step:48/1770 train_time:4884ms step_avg:101.76ms | |
| step:49/1770 train_time:4986ms step_avg:101.75ms | |
| step:50/1770 train_time:5088ms step_avg:101.75ms | |
| step:51/1770 train_time:5189ms step_avg:101.75ms | |
| step:52/1770 train_time:5291ms step_avg:101.76ms | |
| step:53/1770 train_time:5394ms step_avg:101.77ms | |
| step:54/1770 train_time:5496ms step_avg:101.78ms | |
| step:55/1770 train_time:5598ms step_avg:101.77ms | |
| step:56/1770 train_time:5699ms step_avg:101.77ms | |
| step:57/1770 train_time:5800ms step_avg:101.76ms | |
| step:58/1770 train_time:5902ms step_avg:101.76ms | |
| step:59/1770 train_time:6004ms step_avg:101.76ms | |
| step:60/1770 train_time:6105ms step_avg:101.75ms | |
| step:61/1770 train_time:6206ms step_avg:101.74ms | |
| step:62/1770 train_time:6307ms step_avg:101.73ms | |
| step:63/1770 train_time:6409ms step_avg:101.72ms | |
| step:64/1770 train_time:6513ms step_avg:101.77ms | |
| step:65/1770 train_time:6612ms step_avg:101.72ms | |
| step:66/1770 train_time:6714ms step_avg:101.73ms | |
| step:67/1770 train_time:6816ms step_avg:101.73ms | |
| step:68/1770 train_time:6918ms step_avg:101.73ms | |
| step:69/1770 train_time:7019ms step_avg:101.73ms | |
| step:70/1770 train_time:7121ms step_avg:101.73ms | |
| step:71/1770 train_time:7225ms step_avg:101.76ms | |
| step:72/1770 train_time:7325ms step_avg:101.74ms | |
| step:73/1770 train_time:7427ms step_avg:101.74ms | |
| step:74/1770 train_time:7529ms step_avg:101.74ms | |
| step:75/1770 train_time:7630ms step_avg:101.74ms | |
| step:76/1770 train_time:7734ms step_avg:101.76ms | |
| step:77/1770 train_time:7835ms step_avg:101.75ms | |
| step:78/1770 train_time:7937ms step_avg:101.76ms | |
| step:79/1770 train_time:8039ms step_avg:101.76ms | |
| step:80/1770 train_time:8141ms step_avg:101.76ms | |
| step:81/1770 train_time:8243ms step_avg:101.76ms | |
| step:82/1770 train_time:8345ms step_avg:101.77ms | |
| step:83/1770 train_time:8445ms step_avg:101.75ms | |
| step:84/1770 train_time:8546ms step_avg:101.74ms | |
| step:85/1770 train_time:8648ms step_avg:101.74ms | |
| step:86/1770 train_time:8749ms step_avg:101.73ms | |
| step:87/1770 train_time:8851ms step_avg:101.73ms | |
| step:88/1770 train_time:8953ms step_avg:101.74ms | |
| step:89/1770 train_time:9055ms step_avg:101.74ms | |
| step:90/1770 train_time:9156ms step_avg:101.74ms | |
| step:91/1770 train_time:9259ms step_avg:101.75ms | |
| step:92/1770 train_time:9358ms step_avg:101.72ms | |
| step:93/1770 train_time:9460ms step_avg:101.72ms | |
| step:94/1770 train_time:9561ms step_avg:101.72ms | |
| step:95/1770 train_time:9663ms step_avg:101.71ms | |
| step:96/1770 train_time:9763ms step_avg:101.70ms | |
| step:97/1770 train_time:9864ms step_avg:101.70ms | |
| step:98/1770 train_time:9965ms step_avg:101.69ms | |
| step:99/1770 train_time:10067ms step_avg:101.69ms | |
| step:100/1770 train_time:10169ms step_avg:101.69ms | |
| step:101/1770 train_time:10272ms step_avg:101.70ms | |
| step:102/1770 train_time:10373ms step_avg:101.70ms | |
| step:103/1770 train_time:10479ms step_avg:101.74ms | |
| step:104/1770 train_time:10582ms step_avg:101.75ms | |
| step:105/1770 train_time:10681ms step_avg:101.73ms | |
| step:106/1770 train_time:10785ms step_avg:101.74ms | |
| step:107/1770 train_time:10882ms step_avg:101.70ms | |
| step:108/1770 train_time:10984ms step_avg:101.70ms | |
| step:109/1770 train_time:11085ms step_avg:101.70ms | |
| step:110/1770 train_time:11186ms step_avg:101.69ms | |
| step:111/1770 train_time:11287ms step_avg:101.69ms | |
| step:112/1770 train_time:11388ms step_avg:101.68ms | |
| step:113/1770 train_time:11490ms step_avg:101.68ms | |
| step:114/1770 train_time:11592ms step_avg:101.68ms | |
| step:115/1770 train_time:11695ms step_avg:101.69ms | |
| step:116/1770 train_time:11797ms step_avg:101.70ms | |
| step:117/1770 train_time:11898ms step_avg:101.69ms | |
| step:118/1770 train_time:11999ms step_avg:101.69ms | |
| step:119/1770 train_time:12100ms step_avg:101.68ms | |
| step:120/1770 train_time:12202ms step_avg:101.68ms | |
| step:121/1770 train_time:12303ms step_avg:101.68ms | |
| step:122/1770 train_time:12404ms step_avg:101.68ms | |
| step:123/1770 train_time:12506ms step_avg:101.67ms | |
| step:124/1770 train_time:12607ms step_avg:101.67ms | |
| step:125/1770 train_time:12708ms step_avg:101.67ms | |
| step:125/1770 val_loss:5.5387 train_time:12712ms step_avg:101.70ms | |
| step:126/1770 train_time:12814ms step_avg:101.70ms | |
| step:127/1770 train_time:12915ms step_avg:101.69ms | |
| step:128/1770 train_time:13017ms step_avg:101.69ms | |
| step:129/1770 train_time:13118ms step_avg:101.69ms | |
| step:130/1770 train_time:13219ms step_avg:101.69ms | |
| step:131/1770 train_time:13321ms step_avg:101.69ms | |
| step:132/1770 train_time:13422ms step_avg:101.68ms | |
| step:133/1770 train_time:13524ms step_avg:101.69ms | |
| step:134/1770 train_time:13626ms step_avg:101.69ms | |
| step:135/1770 train_time:13728ms step_avg:101.69ms | |
| step:136/1770 train_time:13832ms step_avg:101.70ms | |
| step:137/1770 train_time:13934ms step_avg:101.71ms | |
| step:138/1770 train_time:14036ms step_avg:101.71ms | |
| step:139/1770 train_time:14138ms step_avg:101.71ms | |
| step:140/1770 train_time:14240ms step_avg:101.71ms | |
| step:141/1770 train_time:14342ms step_avg:101.72ms | |
| step:142/1770 train_time:14444ms step_avg:101.72ms | |
| step:143/1770 train_time:14545ms step_avg:101.71ms | |
| step:144/1770 train_time:14647ms step_avg:101.71ms | |
| step:145/1770 train_time:14749ms step_avg:101.72ms | |
| step:146/1770 train_time:14851ms step_avg:101.72ms | |
| step:147/1770 train_time:14954ms step_avg:101.73ms | |
| step:148/1770 train_time:15056ms step_avg:101.73ms | |
| step:149/1770 train_time:15158ms step_avg:101.73ms | |
| step:150/1770 train_time:15260ms step_avg:101.74ms | |
| step:151/1770 train_time:15362ms step_avg:101.74ms | |
| step:152/1770 train_time:15464ms step_avg:101.74ms | |
| step:153/1770 train_time:15566ms step_avg:101.74ms | |
| step:154/1770 train_time:15668ms step_avg:101.74ms | |
| step:155/1770 train_time:15770ms step_avg:101.74ms | |
| step:156/1770 train_time:15872ms step_avg:101.74ms | |
| step:157/1770 train_time:15974ms step_avg:101.75ms | |
| step:158/1770 train_time:16077ms step_avg:101.75ms | |
| step:159/1770 train_time:16179ms step_avg:101.75ms | |
| step:160/1770 train_time:16281ms step_avg:101.76ms | |
| step:161/1770 train_time:16383ms step_avg:101.76ms | |
| step:162/1770 train_time:16485ms step_avg:101.76ms | |
| step:163/1770 train_time:16586ms step_avg:101.76ms | |
| step:164/1770 train_time:16688ms step_avg:101.76ms | |
| step:165/1770 train_time:16791ms step_avg:101.76ms | |
| step:166/1770 train_time:16893ms step_avg:101.77ms | |
| step:167/1770 train_time:16999ms step_avg:101.79ms | |
| step:168/1770 train_time:17102ms step_avg:101.80ms | |
| step:169/1770 train_time:17203ms step_avg:101.79ms | |
| step:170/1770 train_time:17302ms step_avg:101.78ms | |
| step:171/1770 train_time:17404ms step_avg:101.78ms | |
| step:172/1770 train_time:17505ms step_avg:101.77ms | |
| step:173/1770 train_time:17607ms step_avg:101.77ms | |
| step:174/1770 train_time:17709ms step_avg:101.78ms | |
| step:175/1770 train_time:17812ms step_avg:101.78ms | |
| step:176/1770 train_time:17915ms step_avg:101.79ms | |
| step:177/1770 train_time:18017ms step_avg:101.79ms | |
| step:178/1770 train_time:18119ms step_avg:101.79ms | |
| step:179/1770 train_time:18221ms step_avg:101.79ms | |
| step:180/1770 train_time:18323ms step_avg:101.79ms | |
| step:181/1770 train_time:18425ms step_avg:101.80ms | |
| step:182/1770 train_time:18527ms step_avg:101.80ms | |
| step:183/1770 train_time:18629ms step_avg:101.80ms | |
| step:184/1770 train_time:18732ms step_avg:101.80ms | |
| step:185/1770 train_time:18835ms step_avg:101.81ms | |
| step:186/1770 train_time:18937ms step_avg:101.81ms | |
| step:187/1770 train_time:19039ms step_avg:101.82ms | |
| step:188/1770 train_time:19141ms step_avg:101.82ms | |
| step:189/1770 train_time:19243ms step_avg:101.82ms | |
| step:190/1770 train_time:19345ms step_avg:101.82ms | |
| step:191/1770 train_time:19446ms step_avg:101.81ms | |
| step:192/1770 train_time:19548ms step_avg:101.81ms | |
| step:193/1770 train_time:19651ms step_avg:101.82ms | |
| step:194/1770 train_time:19753ms step_avg:101.82ms | |
| step:195/1770 train_time:19857ms step_avg:101.83ms | |
| step:196/1770 train_time:19959ms step_avg:101.83ms | |
| step:197/1770 train_time:20062ms step_avg:101.84ms | |
| step:198/1770 train_time:20163ms step_avg:101.83ms | |
| step:199/1770 train_time:20264ms step_avg:101.83ms | |
| step:200/1770 train_time:20366ms step_avg:101.83ms | |
| step:201/1770 train_time:20468ms step_avg:101.83ms | |
| step:202/1770 train_time:20570ms step_avg:101.83ms | |
| step:203/1770 train_time:20672ms step_avg:101.83ms | |
| step:204/1770 train_time:20774ms step_avg:101.83ms | |
| step:205/1770 train_time:20876ms step_avg:101.83ms | |
| step:206/1770 train_time:20978ms step_avg:101.83ms | |
| step:207/1770 train_time:21080ms step_avg:101.83ms | |
| step:208/1770 train_time:21182ms step_avg:101.84ms | |
| step:209/1770 train_time:21283ms step_avg:101.83ms | |
| step:210/1770 train_time:21385ms step_avg:101.83ms | |
| step:211/1770 train_time:21487ms step_avg:101.84ms | |
| step:212/1770 train_time:21590ms step_avg:101.84ms | |
| step:213/1770 train_time:21693ms step_avg:101.84ms | |
| step:214/1770 train_time:21795ms step_avg:101.85ms | |
| step:215/1770 train_time:21897ms step_avg:101.85ms | |
| step:216/1770 train_time:21999ms step_avg:101.85ms | |
| step:217/1770 train_time:22101ms step_avg:101.85ms | |
| step:218/1770 train_time:22203ms step_avg:101.85ms | |
| step:219/1770 train_time:22305ms step_avg:101.85ms | |
| step:220/1770 train_time:22407ms step_avg:101.85ms | |
| step:221/1770 train_time:22509ms step_avg:101.85ms | |
| step:222/1770 train_time:22612ms step_avg:101.85ms | |
| step:223/1770 train_time:22713ms step_avg:101.85ms | |
| step:224/1770 train_time:22815ms step_avg:101.85ms | |
| step:225/1770 train_time:22917ms step_avg:101.85ms | |
| step:226/1770 train_time:23019ms step_avg:101.85ms | |
| step:227/1770 train_time:23121ms step_avg:101.85ms | |
| step:228/1770 train_time:23222ms step_avg:101.85ms | |
| step:229/1770 train_time:23323ms step_avg:101.85ms | |
| step:230/1770 train_time:23425ms step_avg:101.85ms | |
| step:231/1770 train_time:23527ms step_avg:101.85ms | |
| step:232/1770 train_time:23630ms step_avg:101.85ms | |
| step:233/1770 train_time:23732ms step_avg:101.85ms | |
| step:234/1770 train_time:23835ms step_avg:101.86ms | |
| step:235/1770 train_time:23938ms step_avg:101.86ms | |
| step:236/1770 train_time:24039ms step_avg:101.86ms | |
| step:237/1770 train_time:24141ms step_avg:101.86ms | |
| step:238/1770 train_time:24244ms step_avg:101.86ms | |
| step:239/1770 train_time:24346ms step_avg:101.86ms | |
| step:240/1770 train_time:24447ms step_avg:101.86ms | |
| step:241/1770 train_time:24549ms step_avg:101.86ms | |
| step:242/1770 train_time:24652ms step_avg:101.87ms | |
| step:243/1770 train_time:24754ms step_avg:101.87ms | |
| step:244/1770 train_time:24857ms step_avg:101.87ms | |
| step:245/1770 train_time:24963ms step_avg:101.89ms | |
| step:246/1770 train_time:25061ms step_avg:101.88ms | |
| step:247/1770 train_time:25163ms step_avg:101.87ms | |
| step:248/1770 train_time:25264ms step_avg:101.87ms | |
| step:249/1770 train_time:25366ms step_avg:101.87ms | |
| step:250/1770 train_time:25469ms step_avg:101.88ms | |
| step:250/1770 val_loss:4.9697 train_time:25474ms step_avg:101.89ms | |
| step:251/1770 train_time:25576ms step_avg:101.90ms | |
| step:252/1770 train_time:25678ms step_avg:101.90ms | |
| step:253/1770 train_time:25780ms step_avg:101.90ms | |
| step:254/1770 train_time:25882ms step_avg:101.90ms | |
| step:255/1770 train_time:25983ms step_avg:101.90ms | |
| step:256/1770 train_time:26085ms step_avg:101.89ms | |
| step:257/1770 train_time:26187ms step_avg:101.89ms | |
| step:258/1770 train_time:26288ms step_avg:101.89ms | |
| step:259/1770 train_time:26390ms step_avg:101.89ms | |
| step:260/1770 train_time:26493ms step_avg:101.90ms | |
| step:261/1770 train_time:26595ms step_avg:101.90ms | |
| step:262/1770 train_time:26697ms step_avg:101.90ms | |
| step:263/1770 train_time:26799ms step_avg:101.90ms | |
| step:264/1770 train_time:26901ms step_avg:101.90ms | |
| step:265/1770 train_time:27003ms step_avg:101.90ms | |
| step:266/1770 train_time:27106ms step_avg:101.90ms | |
| step:267/1770 train_time:27208ms step_avg:101.90ms | |
| step:268/1770 train_time:27310ms step_avg:101.90ms | |
| step:269/1770 train_time:27413ms step_avg:101.91ms | |
| step:270/1770 train_time:27516ms step_avg:101.91ms | |
| step:271/1770 train_time:27619ms step_avg:101.92ms | |
| step:272/1770 train_time:27722ms step_avg:101.92ms | |
| step:273/1770 train_time:27825ms step_avg:101.92ms | |
| step:274/1770 train_time:27928ms step_avg:101.93ms | |
| step:275/1770 train_time:28030ms step_avg:101.93ms | |
| step:276/1770 train_time:28134ms step_avg:101.93ms | |
| step:277/1770 train_time:28237ms step_avg:101.94ms | |
| step:278/1770 train_time:28339ms step_avg:101.94ms | |
| step:279/1770 train_time:28442ms step_avg:101.94ms | |
| step:280/1770 train_time:28545ms step_avg:101.95ms | |
| step:281/1770 train_time:28647ms step_avg:101.95ms | |
| step:282/1770 train_time:28750ms step_avg:101.95ms | |
| step:283/1770 train_time:28853ms step_avg:101.96ms | |
| step:284/1770 train_time:28956ms step_avg:101.96ms | |
| step:285/1770 train_time:29059ms step_avg:101.96ms | |
| step:286/1770 train_time:29162ms step_avg:101.97ms | |
| step:287/1770 train_time:29265ms step_avg:101.97ms | |
| step:288/1770 train_time:29368ms step_avg:101.97ms | |
| step:289/1770 train_time:29470ms step_avg:101.97ms | |
| step:290/1770 train_time:29577ms step_avg:101.99ms | |
| step:291/1770 train_time:29681ms step_avg:102.00ms | |
| step:292/1770 train_time:29780ms step_avg:101.99ms | |
| step:293/1770 train_time:29882ms step_avg:101.99ms | |
| step:294/1770 train_time:29984ms step_avg:101.99ms | |
| step:295/1770 train_time:30087ms step_avg:101.99ms | |
| step:296/1770 train_time:30189ms step_avg:101.99ms | |
| step:297/1770 train_time:30291ms step_avg:101.99ms | |
| step:298/1770 train_time:30395ms step_avg:102.00ms | |
| step:299/1770 train_time:30498ms step_avg:102.00ms | |
| step:300/1770 train_time:30600ms step_avg:102.00ms | |
| step:301/1770 train_time:30703ms step_avg:102.00ms | |
| step:302/1770 train_time:30805ms step_avg:102.00ms | |
| step:303/1770 train_time:30908ms step_avg:102.01ms | |
| step:304/1770 train_time:31011ms step_avg:102.01ms | |
| step:305/1770 train_time:31113ms step_avg:102.01ms | |
| step:306/1770 train_time:31216ms step_avg:102.01ms | |
| step:307/1770 train_time:31319ms step_avg:102.01ms | |
| step:308/1770 train_time:31421ms step_avg:102.02ms | |
| step:309/1770 train_time:31523ms step_avg:102.02ms | |
| step:310/1770 train_time:31626ms step_avg:102.02ms | |
| step:311/1770 train_time:31728ms step_avg:102.02ms | |
| step:312/1770 train_time:31831ms step_avg:102.02ms | |
| step:313/1770 train_time:31934ms step_avg:102.03ms | |
| step:314/1770 train_time:32037ms step_avg:102.03ms | |
| step:315/1770 train_time:32140ms step_avg:102.03ms | |
| step:316/1770 train_time:32242ms step_avg:102.03ms | |
| step:317/1770 train_time:32345ms step_avg:102.04ms | |
| step:318/1770 train_time:32448ms step_avg:102.04ms | |
| step:319/1770 train_time:32550ms step_avg:102.04ms | |
| step:320/1770 train_time:32653ms step_avg:102.04ms | |
| step:321/1770 train_time:32755ms step_avg:102.04ms | |
| step:322/1770 train_time:32858ms step_avg:102.04ms | |
| step:323/1770 train_time:32960ms step_avg:102.04ms | |
| step:324/1770 train_time:33063ms step_avg:102.05ms | |
| step:325/1770 train_time:33166ms step_avg:102.05ms | |
| step:326/1770 train_time:33268ms step_avg:102.05ms | |
| step:327/1770 train_time:33371ms step_avg:102.05ms | |
| step:328/1770 train_time:33478ms step_avg:102.07ms | |
| step:329/1770 train_time:33577ms step_avg:102.06ms | |
| step:330/1770 train_time:33679ms step_avg:102.06ms | |
| step:331/1770 train_time:33781ms step_avg:102.06ms | |
| step:332/1770 train_time:33884ms step_avg:102.06ms | |
| step:333/1770 train_time:33986ms step_avg:102.06ms | |
| step:334/1770 train_time:34089ms step_avg:102.06ms | |
| step:335/1770 train_time:34192ms step_avg:102.07ms | |
| step:336/1770 train_time:34295ms step_avg:102.07ms | |
| step:337/1770 train_time:34397ms step_avg:102.07ms | |
| step:338/1770 train_time:34499ms step_avg:102.07ms | |
| step:339/1770 train_time:34601ms step_avg:102.07ms | |
| step:340/1770 train_time:34704ms step_avg:102.07ms | |
| step:341/1770 train_time:34806ms step_avg:102.07ms | |
| step:342/1770 train_time:34908ms step_avg:102.07ms | |
| step:343/1770 train_time:35012ms step_avg:102.07ms | |
| step:344/1770 train_time:35115ms step_avg:102.08ms | |
| step:345/1770 train_time:35218ms step_avg:102.08ms | |
| step:346/1770 train_time:35321ms step_avg:102.08ms | |
| step:347/1770 train_time:35424ms step_avg:102.09ms | |
| step:348/1770 train_time:35526ms step_avg:102.09ms | |
| step:349/1770 train_time:35630ms step_avg:102.09ms | |
| step:350/1770 train_time:35732ms step_avg:102.09ms | |
| step:351/1770 train_time:35835ms step_avg:102.10ms | |
| step:352/1770 train_time:35938ms step_avg:102.10ms | |
| step:353/1770 train_time:36040ms step_avg:102.10ms | |
| step:354/1770 train_time:36143ms step_avg:102.10ms | |
| step:355/1770 train_time:36245ms step_avg:102.10ms | |
| step:356/1770 train_time:36348ms step_avg:102.10ms | |
| step:357/1770 train_time:36451ms step_avg:102.10ms | |
| step:358/1770 train_time:36553ms step_avg:102.10ms | |
| step:359/1770 train_time:36655ms step_avg:102.10ms | |
| step:360/1770 train_time:36758ms step_avg:102.10ms | |
| step:361/1770 train_time:36860ms step_avg:102.11ms | |
| step:362/1770 train_time:36962ms step_avg:102.11ms | |
| step:363/1770 train_time:37065ms step_avg:102.11ms | |
| step:364/1770 train_time:37167ms step_avg:102.11ms | |
| step:365/1770 train_time:37270ms step_avg:102.11ms | |
| step:366/1770 train_time:37378ms step_avg:102.13ms | |
| step:367/1770 train_time:37478ms step_avg:102.12ms | |
| step:368/1770 train_time:37580ms step_avg:102.12ms | |
| step:369/1770 train_time:37682ms step_avg:102.12ms | |
| step:370/1770 train_time:37785ms step_avg:102.12ms | |
| step:371/1770 train_time:37887ms step_avg:102.12ms | |
| step:372/1770 train_time:37989ms step_avg:102.12ms | |
| step:373/1770 train_time:38093ms step_avg:102.13ms | |
| step:374/1770 train_time:38195ms step_avg:102.13ms | |
| step:375/1770 train_time:38298ms step_avg:102.13ms | |
| step:375/1770 val_loss:4.6824 train_time:38302ms step_avg:102.14ms | |
| step:376/1770 train_time:38404ms step_avg:102.14ms | |
| step:377/1770 train_time:38506ms step_avg:102.14ms | |
| step:378/1770 train_time:38609ms step_avg:102.14ms | |
| step:379/1770 train_time:38711ms step_avg:102.14ms | |
| step:380/1770 train_time:38813ms step_avg:102.14ms | |
| step:381/1770 train_time:38916ms step_avg:102.14ms | |
| step:382/1770 train_time:39019ms step_avg:102.14ms | |
| step:383/1770 train_time:39122ms step_avg:102.14ms | |
| step:384/1770 train_time:39224ms step_avg:102.15ms | |
| step:385/1770 train_time:39327ms step_avg:102.15ms | |
| step:386/1770 train_time:39430ms step_avg:102.15ms | |
| step:387/1770 train_time:39532ms step_avg:102.15ms | |
| step:388/1770 train_time:39633ms step_avg:102.15ms | |
| step:389/1770 train_time:39736ms step_avg:102.15ms | |
| step:390/1770 train_time:39838ms step_avg:102.15ms | |
| step:391/1770 train_time:39942ms step_avg:102.15ms | |
| step:392/1770 train_time:40045ms step_avg:102.16ms | |
| step:393/1770 train_time:40147ms step_avg:102.16ms | |
| step:394/1770 train_time:40249ms step_avg:102.15ms | |
| step:395/1770 train_time:40354ms step_avg:102.16ms | |
| step:396/1770 train_time:40458ms step_avg:102.17ms | |
| step:397/1770 train_time:40565ms step_avg:102.18ms | |
| step:398/1770 train_time:40668ms step_avg:102.18ms | |
| step:399/1770 train_time:40772ms step_avg:102.19ms | |
| step:400/1770 train_time:40877ms step_avg:102.19ms | |
| step:401/1770 train_time:40981ms step_avg:102.20ms | |
| step:402/1770 train_time:41086ms step_avg:102.20ms | |
| step:403/1770 train_time:41191ms step_avg:102.21ms | |
| step:404/1770 train_time:41295ms step_avg:102.22ms | |
| step:405/1770 train_time:41399ms step_avg:102.22ms | |
| step:406/1770 train_time:41504ms step_avg:102.23ms | |
| step:407/1770 train_time:41609ms step_avg:102.23ms | |
| step:408/1770 train_time:41713ms step_avg:102.24ms | |
| step:409/1770 train_time:41817ms step_avg:102.24ms | |
| step:410/1770 train_time:41921ms step_avg:102.25ms | |
| step:411/1770 train_time:42026ms step_avg:102.25ms | |
| step:412/1770 train_time:42132ms step_avg:102.26ms | |
| step:413/1770 train_time:42236ms step_avg:102.27ms | |
| step:414/1770 train_time:42340ms step_avg:102.27ms | |
| step:415/1770 train_time:42445ms step_avg:102.28ms | |
| step:416/1770 train_time:42549ms step_avg:102.28ms | |
| step:417/1770 train_time:42654ms step_avg:102.29ms | |
| step:418/1770 train_time:42758ms step_avg:102.29ms | |
| step:419/1770 train_time:42863ms step_avg:102.30ms | |
| step:420/1770 train_time:42968ms step_avg:102.30ms | |
| step:421/1770 train_time:43072ms step_avg:102.31ms | |
| step:422/1770 train_time:43176ms step_avg:102.31ms | |
| step:423/1770 train_time:43281ms step_avg:102.32ms | |
| step:424/1770 train_time:43386ms step_avg:102.32ms | |
| step:425/1770 train_time:43490ms step_avg:102.33ms | |
| step:426/1770 train_time:43595ms step_avg:102.34ms | |
| step:427/1770 train_time:43699ms step_avg:102.34ms | |
| step:428/1770 train_time:43805ms step_avg:102.35ms | |
| step:429/1770 train_time:43909ms step_avg:102.35ms | |
| step:430/1770 train_time:44014ms step_avg:102.36ms | |
| step:431/1770 train_time:44118ms step_avg:102.36ms | |
| step:432/1770 train_time:44223ms step_avg:102.37ms | |
| step:433/1770 train_time:44327ms step_avg:102.37ms | |
| step:434/1770 train_time:44432ms step_avg:102.38ms | |
| step:435/1770 train_time:44536ms step_avg:102.38ms | |
| step:436/1770 train_time:44640ms step_avg:102.39ms | |
| step:437/1770 train_time:44745ms step_avg:102.39ms | |
| step:438/1770 train_time:44849ms step_avg:102.39ms | |
| step:439/1770 train_time:44953ms step_avg:102.40ms | |
| step:440/1770 train_time:45056ms step_avg:102.40ms | |
| step:441/1770 train_time:45161ms step_avg:102.41ms | |
| step:442/1770 train_time:45266ms step_avg:102.41ms | |
| step:443/1770 train_time:45371ms step_avg:102.42ms | |
| step:444/1770 train_time:45476ms step_avg:102.42ms | |
| step:445/1770 train_time:45580ms step_avg:102.43ms | |
| step:446/1770 train_time:45685ms step_avg:102.43ms | |
| step:447/1770 train_time:45789ms step_avg:102.44ms | |
| step:448/1770 train_time:45894ms step_avg:102.44ms | |
| step:449/1770 train_time:45998ms step_avg:102.44ms | |
| step:450/1770 train_time:46104ms step_avg:102.45ms | |
| step:451/1770 train_time:46207ms step_avg:102.45ms | |
| step:452/1770 train_time:46311ms step_avg:102.46ms | |
| step:453/1770 train_time:46415ms step_avg:102.46ms | |
| step:454/1770 train_time:46520ms step_avg:102.47ms | |
| step:455/1770 train_time:46625ms step_avg:102.47ms | |
| step:456/1770 train_time:46730ms step_avg:102.48ms | |
| step:457/1770 train_time:46834ms step_avg:102.48ms | |
| step:458/1770 train_time:46939ms step_avg:102.49ms | |
| step:459/1770 train_time:47044ms step_avg:102.49ms | |
| step:460/1770 train_time:47149ms step_avg:102.50ms | |
| step:461/1770 train_time:47253ms step_avg:102.50ms | |
| step:462/1770 train_time:47358ms step_avg:102.51ms | |
| step:463/1770 train_time:47462ms step_avg:102.51ms | |
| step:464/1770 train_time:47567ms step_avg:102.51ms | |
| step:465/1770 train_time:47671ms step_avg:102.52ms | |
| step:466/1770 train_time:47776ms step_avg:102.52ms | |
| step:467/1770 train_time:47880ms step_avg:102.53ms | |
| step:468/1770 train_time:47986ms step_avg:102.53ms | |
| step:469/1770 train_time:48090ms step_avg:102.54ms | |
| step:470/1770 train_time:48195ms step_avg:102.54ms | |
| step:471/1770 train_time:48299ms step_avg:102.54ms | |
| step:472/1770 train_time:48402ms step_avg:102.55ms | |
| step:473/1770 train_time:48506ms step_avg:102.55ms | |
| step:474/1770 train_time:48610ms step_avg:102.55ms | |
| step:475/1770 train_time:48714ms step_avg:102.56ms | |
| step:476/1770 train_time:48820ms step_avg:102.56ms | |
| step:477/1770 train_time:48924ms step_avg:102.57ms | |
| step:478/1770 train_time:49029ms step_avg:102.57ms | |
| step:479/1770 train_time:49133ms step_avg:102.57ms | |
| step:480/1770 train_time:49238ms step_avg:102.58ms | |
| step:481/1770 train_time:49343ms step_avg:102.58ms | |
| step:482/1770 train_time:49447ms step_avg:102.59ms | |
| step:483/1770 train_time:49551ms step_avg:102.59ms | |
| step:484/1770 train_time:49655ms step_avg:102.59ms | |
| step:485/1770 train_time:49760ms step_avg:102.60ms | |
| step:486/1770 train_time:49865ms step_avg:102.60ms | |
| step:487/1770 train_time:49970ms step_avg:102.61ms | |
| step:488/1770 train_time:50074ms step_avg:102.61ms | |
| step:489/1770 train_time:50178ms step_avg:102.61ms | |
| step:490/1770 train_time:50283ms step_avg:102.62ms | |
| step:491/1770 train_time:50387ms step_avg:102.62ms | |
| step:492/1770 train_time:50492ms step_avg:102.63ms | |
| step:493/1770 train_time:50598ms step_avg:102.63ms | |
| step:494/1770 train_time:50704ms step_avg:102.64ms | |
| step:495/1770 train_time:50807ms step_avg:102.64ms | |
| step:496/1770 train_time:50913ms step_avg:102.65ms | |
| step:497/1770 train_time:51015ms step_avg:102.65ms | |
| step:498/1770 train_time:51120ms step_avg:102.65ms | |
| step:499/1770 train_time:51225ms step_avg:102.65ms | |
| step:500/1770 train_time:51333ms step_avg:102.67ms | |
| step:500/1770 val_loss:4.4969 train_time:51334ms step_avg:102.67ms | |
| step:501/1770 train_time:51438ms step_avg:102.67ms | |
| step:502/1770 train_time:51544ms step_avg:102.68ms | |
| step:503/1770 train_time:51649ms step_avg:102.68ms | |
| step:504/1770 train_time:51753ms step_avg:102.69ms | |
| step:505/1770 train_time:51858ms step_avg:102.69ms | |
| step:506/1770 train_time:51963ms step_avg:102.69ms | |
| step:507/1770 train_time:52067ms step_avg:102.70ms | |
| step:508/1770 train_time:52172ms step_avg:102.70ms | |
| step:509/1770 train_time:52276ms step_avg:102.70ms | |
| step:510/1770 train_time:52381ms step_avg:102.71ms | |
| step:511/1770 train_time:52485ms step_avg:102.71ms | |
| step:512/1770 train_time:52590ms step_avg:102.71ms | |
| step:513/1770 train_time:52695ms step_avg:102.72ms | |
| step:514/1770 train_time:52800ms step_avg:102.72ms | |
| step:515/1770 train_time:52904ms step_avg:102.73ms | |
| step:516/1770 train_time:53010ms step_avg:102.73ms | |
| step:517/1770 train_time:53114ms step_avg:102.73ms | |
| step:518/1770 train_time:53219ms step_avg:102.74ms | |
| step:519/1770 train_time:53323ms step_avg:102.74ms | |
| step:520/1770 train_time:53427ms step_avg:102.74ms | |
| step:521/1770 train_time:53532ms step_avg:102.75ms | |
| step:522/1770 train_time:53636ms step_avg:102.75ms | |
| step:523/1770 train_time:53741ms step_avg:102.76ms | |
| step:524/1770 train_time:53845ms step_avg:102.76ms | |
| step:525/1770 train_time:53950ms step_avg:102.76ms | |
| step:526/1770 train_time:54054ms step_avg:102.76ms | |
| step:527/1770 train_time:54159ms step_avg:102.77ms | |
| step:528/1770 train_time:54264ms step_avg:102.77ms | |
| step:529/1770 train_time:54369ms step_avg:102.78ms | |
| step:530/1770 train_time:54474ms step_avg:102.78ms | |
| step:531/1770 train_time:54579ms step_avg:102.78ms | |
| step:532/1770 train_time:54684ms step_avg:102.79ms | |
| step:533/1770 train_time:54789ms step_avg:102.79ms | |
| step:534/1770 train_time:54894ms step_avg:102.80ms | |
| step:535/1770 train_time:54998ms step_avg:102.80ms | |
| step:536/1770 train_time:55103ms step_avg:102.80ms | |
| step:537/1770 train_time:55208ms step_avg:102.81ms | |
| step:538/1770 train_time:55313ms step_avg:102.81ms | |
| step:539/1770 train_time:55418ms step_avg:102.82ms | |
| step:540/1770 train_time:55523ms step_avg:102.82ms | |
| step:541/1770 train_time:55628ms step_avg:102.83ms | |
| step:542/1770 train_time:55733ms step_avg:102.83ms | |
| step:543/1770 train_time:55839ms step_avg:102.83ms | |
| step:544/1770 train_time:55944ms step_avg:102.84ms | |
| step:545/1770 train_time:56049ms step_avg:102.84ms | |
| step:546/1770 train_time:56153ms step_avg:102.85ms | |
| step:547/1770 train_time:56259ms step_avg:102.85ms | |
| step:548/1770 train_time:56364ms step_avg:102.85ms | |
| step:549/1770 train_time:56469ms step_avg:102.86ms | |
| step:550/1770 train_time:56573ms step_avg:102.86ms | |
| step:551/1770 train_time:56678ms step_avg:102.86ms | |
| step:552/1770 train_time:56784ms step_avg:102.87ms | |
| step:553/1770 train_time:56889ms step_avg:102.87ms | |
| step:554/1770 train_time:56994ms step_avg:102.88ms | |
| step:555/1770 train_time:57098ms step_avg:102.88ms | |
| step:556/1770 train_time:57203ms step_avg:102.88ms | |
| step:557/1770 train_time:57309ms step_avg:102.89ms | |
| step:558/1770 train_time:57414ms step_avg:102.89ms | |
| step:559/1770 train_time:57519ms step_avg:102.90ms | |
| step:560/1770 train_time:57623ms step_avg:102.90ms | |
| step:561/1770 train_time:57732ms step_avg:102.91ms | |
| step:562/1770 train_time:57834ms step_avg:102.91ms | |
| step:563/1770 train_time:57938ms step_avg:102.91ms | |
| step:564/1770 train_time:58043ms step_avg:102.91ms | |
| step:565/1770 train_time:58148ms step_avg:102.92ms | |
| step:566/1770 train_time:58257ms step_avg:102.93ms | |
| step:567/1770 train_time:58359ms step_avg:102.93ms | |
| step:568/1770 train_time:58464ms step_avg:102.93ms | |
| step:569/1770 train_time:58570ms step_avg:102.93ms | |
| step:570/1770 train_time:58675ms step_avg:102.94ms | |
| step:571/1770 train_time:58780ms step_avg:102.94ms | |
| step:572/1770 train_time:58884ms step_avg:102.94ms | |
| step:573/1770 train_time:58989ms step_avg:102.95ms | |
| step:574/1770 train_time:59094ms step_avg:102.95ms | |
| step:575/1770 train_time:59199ms step_avg:102.96ms | |
| step:576/1770 train_time:59304ms step_avg:102.96ms | |
| step:577/1770 train_time:59409ms step_avg:102.96ms | |
| step:578/1770 train_time:59514ms step_avg:102.97ms | |
| step:579/1770 train_time:59619ms step_avg:102.97ms | |
| step:580/1770 train_time:59731ms step_avg:102.98ms | |
| step:581/1770 train_time:59834ms step_avg:102.98ms | |
| step:582/1770 train_time:59942ms step_avg:102.99ms | |
| step:583/1770 train_time:60044ms step_avg:102.99ms | |
| step:584/1770 train_time:60147ms step_avg:102.99ms | |
| step:585/1770 train_time:60252ms step_avg:102.99ms | |
| step:586/1770 train_time:60358ms step_avg:103.00ms | |
| step:587/1770 train_time:60462ms step_avg:103.00ms | |
| step:588/1770 train_time:60568ms step_avg:103.01ms | |
| step:589/1770 train_time:60673ms step_avg:103.01ms | |
| step:590/1770 train_time:60779ms step_avg:103.02ms | |
| step:591/1770 train_time:60884ms step_avg:103.02ms | |
| step:592/1770 train_time:60990ms step_avg:103.02ms | |
| step:593/1770 train_time:61094ms step_avg:103.03ms | |
| step:594/1770 train_time:61199ms step_avg:103.03ms | |
| step:595/1770 train_time:61303ms step_avg:103.03ms | |
| step:596/1770 train_time:61408ms step_avg:103.03ms | |
| step:597/1770 train_time:61512ms step_avg:103.04ms | |
| step:598/1770 train_time:61617ms step_avg:103.04ms | |
| step:599/1770 train_time:61722ms step_avg:103.04ms | |
| step:600/1770 train_time:61831ms step_avg:103.05ms | |
| step:601/1770 train_time:61933ms step_avg:103.05ms | |
| step:602/1770 train_time:62038ms step_avg:103.05ms | |
| step:603/1770 train_time:62143ms step_avg:103.06ms | |
| step:604/1770 train_time:62248ms step_avg:103.06ms | |
| step:605/1770 train_time:62356ms step_avg:103.07ms | |
| step:606/1770 train_time:62458ms step_avg:103.07ms | |
| step:607/1770 train_time:62562ms step_avg:103.07ms | |
| step:608/1770 train_time:62667ms step_avg:103.07ms | |
| step:609/1770 train_time:62772ms step_avg:103.07ms | |
| step:610/1770 train_time:62877ms step_avg:103.08ms | |
| step:611/1770 train_time:62983ms step_avg:103.08ms | |
| step:612/1770 train_time:63087ms step_avg:103.08ms | |
| step:613/1770 train_time:63192ms step_avg:103.09ms | |
| step:614/1770 train_time:63301ms step_avg:103.10ms | |
| step:615/1770 train_time:63402ms step_avg:103.09ms | |
| step:616/1770 train_time:63506ms step_avg:103.09ms | |
| step:617/1770 train_time:63611ms step_avg:103.10ms | |
| step:618/1770 train_time:63715ms step_avg:103.10ms | |
| step:619/1770 train_time:63820ms step_avg:103.10ms | |
| step:620/1770 train_time:63925ms step_avg:103.10ms | |
| step:621/1770 train_time:64030ms step_avg:103.11ms | |
| step:622/1770 train_time:64135ms step_avg:103.11ms | |
| step:623/1770 train_time:64240ms step_avg:103.11ms | |
| step:624/1770 train_time:64345ms step_avg:103.12ms | |
| step:625/1770 train_time:64450ms step_avg:103.12ms | |
| step:625/1770 val_loss:4.3805 train_time:64454ms step_avg:103.13ms | |
| step:626/1770 train_time:64562ms step_avg:103.13ms | |
| step:627/1770 train_time:64669ms step_avg:103.14ms | |
| step:628/1770 train_time:64773ms step_avg:103.14ms | |
| step:629/1770 train_time:64878ms step_avg:103.14ms | |
| step:630/1770 train_time:64984ms step_avg:103.15ms | |
| step:631/1770 train_time:65088ms step_avg:103.15ms | |
| step:632/1770 train_time:65193ms step_avg:103.15ms | |
| step:633/1770 train_time:65298ms step_avg:103.16ms | |
| step:634/1770 train_time:65403ms step_avg:103.16ms | |
| step:635/1770 train_time:65508ms step_avg:103.16ms | |
| step:636/1770 train_time:65613ms step_avg:103.17ms | |
| step:637/1770 train_time:65719ms step_avg:103.17ms | |
| step:638/1770 train_time:65824ms step_avg:103.17ms | |
| step:639/1770 train_time:65930ms step_avg:103.18ms | |
| step:640/1770 train_time:66035ms step_avg:103.18ms | |
| step:641/1770 train_time:66140ms step_avg:103.18ms | |
| step:642/1770 train_time:66245ms step_avg:103.18ms | |
| step:643/1770 train_time:66351ms step_avg:103.19ms | |
| step:644/1770 train_time:66454ms step_avg:103.19ms | |
| step:645/1770 train_time:66559ms step_avg:103.19ms | |
| step:646/1770 train_time:66665ms step_avg:103.20ms | |
| step:647/1770 train_time:66770ms step_avg:103.20ms | |
| step:648/1770 train_time:66874ms step_avg:103.20ms | |
| step:649/1770 train_time:66979ms step_avg:103.20ms | |
| step:650/1770 train_time:67084ms step_avg:103.21ms | |
| step:651/1770 train_time:67188ms step_avg:103.21ms | |
| step:652/1770 train_time:67293ms step_avg:103.21ms | |
| step:653/1770 train_time:67399ms step_avg:103.21ms | |
| step:654/1770 train_time:67503ms step_avg:103.22ms | |
| step:655/1770 train_time:67608ms step_avg:103.22ms | |
| step:656/1770 train_time:67713ms step_avg:103.22ms | |
| step:657/1770 train_time:67820ms step_avg:103.23ms | |
| step:658/1770 train_time:67927ms step_avg:103.23ms | |
| step:659/1770 train_time:68034ms step_avg:103.24ms | |
| step:660/1770 train_time:68141ms step_avg:103.24ms | |
| step:661/1770 train_time:68247ms step_avg:103.25ms | |
| step:662/1770 train_time:68354ms step_avg:103.25ms | |
| step:663/1770 train_time:68461ms step_avg:103.26ms | |
| step:664/1770 train_time:68566ms step_avg:103.26ms | |
| step:665/1770 train_time:68673ms step_avg:103.27ms | |
| step:666/1770 train_time:68780ms step_avg:103.27ms | |
| step:667/1770 train_time:68887ms step_avg:103.28ms | |
| step:668/1770 train_time:68993ms step_avg:103.28ms | |
| step:669/1770 train_time:69100ms step_avg:103.29ms | |
| step:670/1770 train_time:69207ms step_avg:103.29ms | |
| step:671/1770 train_time:69312ms step_avg:103.30ms | |
| step:672/1770 train_time:69420ms step_avg:103.30ms | |
| step:673/1770 train_time:69527ms step_avg:103.31ms | |
| step:674/1770 train_time:69633ms step_avg:103.31ms | |
| step:675/1770 train_time:69741ms step_avg:103.32ms | |
| step:676/1770 train_time:69848ms step_avg:103.32ms | |
| step:677/1770 train_time:69958ms step_avg:103.33ms | |
| step:678/1770 train_time:70061ms step_avg:103.34ms | |
| step:679/1770 train_time:70170ms step_avg:103.34ms | |
| step:680/1770 train_time:70274ms step_avg:103.34ms | |
| step:681/1770 train_time:70382ms step_avg:103.35ms | |
| step:682/1770 train_time:70490ms step_avg:103.36ms | |
| step:683/1770 train_time:70600ms step_avg:103.37ms | |
| step:684/1770 train_time:70708ms step_avg:103.37ms | |
| step:685/1770 train_time:70814ms step_avg:103.38ms | |
| step:686/1770 train_time:70923ms step_avg:103.39ms | |
| step:687/1770 train_time:71028ms step_avg:103.39ms | |
| step:688/1770 train_time:71134ms step_avg:103.39ms | |
| step:689/1770 train_time:71241ms step_avg:103.40ms | |
| step:690/1770 train_time:71348ms step_avg:103.40ms | |
| step:691/1770 train_time:71454ms step_avg:103.41ms | |
| step:692/1770 train_time:71564ms step_avg:103.42ms | |
| step:693/1770 train_time:71669ms step_avg:103.42ms | |
| step:694/1770 train_time:71775ms step_avg:103.42ms | |
| step:695/1770 train_time:71880ms step_avg:103.43ms | |
| step:696/1770 train_time:71987ms step_avg:103.43ms | |
| step:697/1770 train_time:72094ms step_avg:103.43ms | |
| step:698/1770 train_time:72201ms step_avg:103.44ms | |
| step:699/1770 train_time:72307ms step_avg:103.44ms | |
| step:700/1770 train_time:72413ms step_avg:103.45ms | |
| step:701/1770 train_time:72520ms step_avg:103.45ms | |
| step:702/1770 train_time:72627ms step_avg:103.46ms | |
| step:703/1770 train_time:72733ms step_avg:103.46ms | |
| step:704/1770 train_time:72840ms step_avg:103.47ms | |
| step:705/1770 train_time:72946ms step_avg:103.47ms | |
| step:706/1770 train_time:73053ms step_avg:103.47ms | |
| step:707/1770 train_time:73160ms step_avg:103.48ms | |
| step:708/1770 train_time:73266ms step_avg:103.48ms | |
| step:709/1770 train_time:73371ms step_avg:103.49ms | |
| step:710/1770 train_time:73479ms step_avg:103.49ms | |
| step:711/1770 train_time:73585ms step_avg:103.50ms | |
| step:712/1770 train_time:73692ms step_avg:103.50ms | |
| step:713/1770 train_time:73799ms step_avg:103.50ms | |
| step:714/1770 train_time:73905ms step_avg:103.51ms | |
| step:715/1770 train_time:74015ms step_avg:103.52ms | |
| step:716/1770 train_time:74120ms step_avg:103.52ms | |
| step:717/1770 train_time:74226ms step_avg:103.52ms | |
| step:718/1770 train_time:74332ms step_avg:103.53ms | |
| step:719/1770 train_time:74440ms step_avg:103.53ms | |
| step:720/1770 train_time:74546ms step_avg:103.54ms | |
| step:721/1770 train_time:74653ms step_avg:103.54ms | |
| step:722/1770 train_time:74760ms step_avg:103.55ms | |
| step:723/1770 train_time:74866ms step_avg:103.55ms | |
| step:724/1770 train_time:74971ms step_avg:103.55ms | |
| step:725/1770 train_time:75078ms step_avg:103.56ms | |
| step:726/1770 train_time:75184ms step_avg:103.56ms | |
| step:727/1770 train_time:75291ms step_avg:103.56ms | |
| step:728/1770 train_time:75397ms step_avg:103.57ms | |
| step:729/1770 train_time:75505ms step_avg:103.57ms | |
| step:730/1770 train_time:75612ms step_avg:103.58ms | |
| step:731/1770 train_time:75719ms step_avg:103.58ms | |
| step:732/1770 train_time:75826ms step_avg:103.59ms | |
| step:733/1770 train_time:75933ms step_avg:103.59ms | |
| step:734/1770 train_time:76040ms step_avg:103.60ms | |
| step:735/1770 train_time:76146ms step_avg:103.60ms | |
| step:736/1770 train_time:76258ms step_avg:103.61ms | |
| step:737/1770 train_time:76362ms step_avg:103.61ms | |
| step:738/1770 train_time:76468ms step_avg:103.61ms | |
| step:739/1770 train_time:76573ms step_avg:103.62ms | |
| step:740/1770 train_time:76681ms step_avg:103.62ms | |
| step:741/1770 train_time:76787ms step_avg:103.63ms | |
| step:742/1770 train_time:76894ms step_avg:103.63ms | |
| step:743/1770 train_time:77002ms step_avg:103.64ms | |
| step:744/1770 train_time:77109ms step_avg:103.64ms | |
| step:745/1770 train_time:77215ms step_avg:103.64ms | |
| step:746/1770 train_time:77321ms step_avg:103.65ms | |
| step:747/1770 train_time:77428ms step_avg:103.65ms | |
| step:748/1770 train_time:77535ms step_avg:103.66ms | |
| step:749/1770 train_time:77642ms step_avg:103.66ms | |
| step:750/1770 train_time:77749ms step_avg:103.67ms | |
| step:750/1770 val_loss:4.2975 train_time:77753ms step_avg:103.67ms | |
| step:751/1770 train_time:77861ms step_avg:103.68ms | |
| step:752/1770 train_time:77968ms step_avg:103.68ms | |
| step:753/1770 train_time:78076ms step_avg:103.69ms | |
| step:754/1770 train_time:78182ms step_avg:103.69ms | |
| step:755/1770 train_time:78289ms step_avg:103.69ms | |
| step:756/1770 train_time:78396ms step_avg:103.70ms | |
| step:757/1770 train_time:78503ms step_avg:103.70ms | |
| step:758/1770 train_time:78610ms step_avg:103.71ms | |
| step:759/1770 train_time:78716ms step_avg:103.71ms | |
| step:760/1770 train_time:78823ms step_avg:103.71ms | |
| step:761/1770 train_time:78931ms step_avg:103.72ms | |
| step:762/1770 train_time:79038ms step_avg:103.72ms | |
| step:763/1770 train_time:79145ms step_avg:103.73ms | |
| step:764/1770 train_time:79252ms step_avg:103.73ms | |
| step:765/1770 train_time:79359ms step_avg:103.74ms | |
| step:766/1770 train_time:79466ms step_avg:103.74ms | |
| step:767/1770 train_time:79572ms step_avg:103.74ms | |
| step:768/1770 train_time:79680ms step_avg:103.75ms | |
| step:769/1770 train_time:79786ms step_avg:103.75ms | |
| step:770/1770 train_time:79893ms step_avg:103.76ms | |
| step:771/1770 train_time:80000ms step_avg:103.76ms | |
| step:772/1770 train_time:80106ms step_avg:103.76ms | |
| step:773/1770 train_time:80213ms step_avg:103.77ms | |
| step:774/1770 train_time:80319ms step_avg:103.77ms | |
| step:775/1770 train_time:80426ms step_avg:103.78ms | |
| step:776/1770 train_time:80533ms step_avg:103.78ms | |
| step:777/1770 train_time:80640ms step_avg:103.78ms | |
| step:778/1770 train_time:80747ms step_avg:103.79ms | |
| step:779/1770 train_time:80853ms step_avg:103.79ms | |
| step:780/1770 train_time:80960ms step_avg:103.80ms | |
| step:781/1770 train_time:81068ms step_avg:103.80ms | |
| step:782/1770 train_time:81174ms step_avg:103.80ms | |
| step:783/1770 train_time:81281ms step_avg:103.81ms | |
| step:784/1770 train_time:81388ms step_avg:103.81ms | |
| step:785/1770 train_time:81495ms step_avg:103.81ms | |
| step:786/1770 train_time:81602ms step_avg:103.82ms | |
| step:787/1770 train_time:81708ms step_avg:103.82ms | |
| step:788/1770 train_time:81815ms step_avg:103.83ms | |
| step:789/1770 train_time:81922ms step_avg:103.83ms | |
| step:790/1770 train_time:82029ms step_avg:103.83ms | |
| step:791/1770 train_time:82137ms step_avg:103.84ms | |
| step:792/1770 train_time:82244ms step_avg:103.84ms | |
| step:793/1770 train_time:82352ms step_avg:103.85ms | |
| step:794/1770 train_time:82459ms step_avg:103.85ms | |
| step:795/1770 train_time:82566ms step_avg:103.86ms | |
| step:796/1770 train_time:82673ms step_avg:103.86ms | |
| step:797/1770 train_time:82780ms step_avg:103.86ms | |
| step:798/1770 train_time:82887ms step_avg:103.87ms | |
| step:799/1770 train_time:82994ms step_avg:103.87ms | |
| step:800/1770 train_time:83103ms step_avg:103.88ms | |
| step:801/1770 train_time:83208ms step_avg:103.88ms | |
| step:802/1770 train_time:83314ms step_avg:103.88ms | |
| step:803/1770 train_time:83420ms step_avg:103.89ms | |
| step:804/1770 train_time:83527ms step_avg:103.89ms | |
| step:805/1770 train_time:83635ms step_avg:103.89ms | |
| step:806/1770 train_time:83741ms step_avg:103.90ms | |
| step:807/1770 train_time:83849ms step_avg:103.90ms | |
| step:808/1770 train_time:83956ms step_avg:103.91ms | |
| step:809/1770 train_time:84063ms step_avg:103.91ms | |
| step:810/1770 train_time:84171ms step_avg:103.91ms | |
| step:811/1770 train_time:84279ms step_avg:103.92ms | |
| step:812/1770 train_time:84386ms step_avg:103.92ms | |
| step:813/1770 train_time:84493ms step_avg:103.93ms | |
| step:814/1770 train_time:84600ms step_avg:103.93ms | |
| step:815/1770 train_time:84707ms step_avg:103.94ms | |
| step:816/1770 train_time:84814ms step_avg:103.94ms | |
| step:817/1770 train_time:84921ms step_avg:103.94ms | |
| step:818/1770 train_time:85028ms step_avg:103.95ms | |
| step:819/1770 train_time:85135ms step_avg:103.95ms | |
| step:820/1770 train_time:85242ms step_avg:103.95ms | |
| step:821/1770 train_time:85350ms step_avg:103.96ms | |
| step:822/1770 train_time:85457ms step_avg:103.96ms | |
| step:823/1770 train_time:85564ms step_avg:103.97ms | |
| step:824/1770 train_time:85672ms step_avg:103.97ms | |
| step:825/1770 train_time:85779ms step_avg:103.97ms | |
| step:826/1770 train_time:85886ms step_avg:103.98ms | |
| step:827/1770 train_time:85993ms step_avg:103.98ms | |
| step:828/1770 train_time:86099ms step_avg:103.98ms | |
| step:829/1770 train_time:86207ms step_avg:103.99ms | |
| step:830/1770 train_time:86313ms step_avg:103.99ms | |
| step:831/1770 train_time:86420ms step_avg:103.99ms | |
| step:832/1770 train_time:86527ms step_avg:104.00ms | |
| step:833/1770 train_time:86634ms step_avg:104.00ms | |
| step:834/1770 train_time:86741ms step_avg:104.01ms | |
| step:835/1770 train_time:86849ms step_avg:104.01ms | |
| step:836/1770 train_time:86957ms step_avg:104.02ms | |
| step:837/1770 train_time:87064ms step_avg:104.02ms | |
| step:838/1770 train_time:87171ms step_avg:104.02ms | |
| step:839/1770 train_time:87279ms step_avg:104.03ms | |
| step:840/1770 train_time:87388ms step_avg:104.03ms | |
| step:841/1770 train_time:87495ms step_avg:104.04ms | |
| step:842/1770 train_time:87603ms step_avg:104.04ms | |
| step:843/1770 train_time:87710ms step_avg:104.05ms | |
| step:844/1770 train_time:87818ms step_avg:104.05ms | |
| step:845/1770 train_time:87925ms step_avg:104.05ms | |
| step:846/1770 train_time:88033ms step_avg:104.06ms | |
| step:847/1770 train_time:88139ms step_avg:104.06ms | |
| step:848/1770 train_time:88246ms step_avg:104.06ms | |
| step:849/1770 train_time:88352ms step_avg:104.07ms | |
| step:850/1770 train_time:88459ms step_avg:104.07ms | |
| step:851/1770 train_time:88566ms step_avg:104.07ms | |
| step:852/1770 train_time:88673ms step_avg:104.08ms | |
| step:853/1770 train_time:88779ms step_avg:104.08ms | |
| step:854/1770 train_time:88886ms step_avg:104.08ms | |
| step:855/1770 train_time:88993ms step_avg:104.09ms | |
| step:856/1770 train_time:89100ms step_avg:104.09ms | |
| step:857/1770 train_time:89208ms step_avg:104.09ms | |
| step:858/1770 train_time:89314ms step_avg:104.10ms | |
| step:859/1770 train_time:89420ms step_avg:104.10ms | |
| step:860/1770 train_time:89527ms step_avg:104.10ms | |
| step:861/1770 train_time:89634ms step_avg:104.10ms | |
| step:862/1770 train_time:89741ms step_avg:104.11ms | |
| step:863/1770 train_time:89848ms step_avg:104.11ms | |
| step:864/1770 train_time:89954ms step_avg:104.11ms | |
| step:865/1770 train_time:90061ms step_avg:104.12ms | |
| step:866/1770 train_time:90170ms step_avg:104.12ms | |
| step:867/1770 train_time:90278ms step_avg:104.13ms | |
| step:868/1770 train_time:90386ms step_avg:104.13ms | |
| step:869/1770 train_time:90493ms step_avg:104.13ms | |
| step:870/1770 train_time:90601ms step_avg:104.14ms | |
| step:871/1770 train_time:90708ms step_avg:104.14ms | |
| step:872/1770 train_time:90814ms step_avg:104.14ms | |
| step:873/1770 train_time:90921ms step_avg:104.15ms | |
| step:874/1770 train_time:91028ms step_avg:104.15ms | |
| step:875/1770 train_time:91135ms step_avg:104.15ms | |
| step:875/1770 val_loss:4.2439 train_time:91139ms step_avg:104.16ms | |
| step:876/1770 train_time:91246ms step_avg:104.16ms | |
| step:877/1770 train_time:91354ms step_avg:104.17ms | |
| step:878/1770 train_time:91461ms step_avg:104.17ms | |
| step:879/1770 train_time:91567ms step_avg:104.17ms | |
| step:880/1770 train_time:91676ms step_avg:104.18ms | |
| step:881/1770 train_time:91783ms step_avg:104.18ms | |
| step:882/1770 train_time:91889ms step_avg:104.18ms | |
| step:883/1770 train_time:91997ms step_avg:104.19ms | |
| step:884/1770 train_time:92104ms step_avg:104.19ms | |
| step:885/1770 train_time:92210ms step_avg:104.19ms | |
| step:886/1770 train_time:92318ms step_avg:104.20ms | |
| step:887/1770 train_time:92426ms step_avg:104.20ms | |
| step:888/1770 train_time:92532ms step_avg:104.20ms | |
| step:889/1770 train_time:92639ms step_avg:104.21ms | |
| step:890/1770 train_time:92745ms step_avg:104.21ms | |
| step:891/1770 train_time:92852ms step_avg:104.21ms | |
| step:892/1770 train_time:92959ms step_avg:104.21ms | |
| step:893/1770 train_time:93067ms step_avg:104.22ms | |
| step:894/1770 train_time:93174ms step_avg:104.22ms | |
| step:895/1770 train_time:93281ms step_avg:104.22ms | |
| step:896/1770 train_time:93388ms step_avg:104.23ms | |
| step:897/1770 train_time:93495ms step_avg:104.23ms | |
| step:898/1770 train_time:93603ms step_avg:104.23ms | |
| step:899/1770 train_time:93710ms step_avg:104.24ms | |
| step:900/1770 train_time:93817ms step_avg:104.24ms | |
| step:901/1770 train_time:93924ms step_avg:104.24ms | |
| step:902/1770 train_time:94031ms step_avg:104.25ms | |
| step:903/1770 train_time:94138ms step_avg:104.25ms | |
| step:904/1770 train_time:94245ms step_avg:104.25ms | |
| step:905/1770 train_time:94352ms step_avg:104.26ms | |
| step:906/1770 train_time:94459ms step_avg:104.26ms | |
| step:907/1770 train_time:94566ms step_avg:104.26ms | |
| step:908/1770 train_time:94675ms step_avg:104.27ms | |
| step:909/1770 train_time:94779ms step_avg:104.27ms | |
| step:910/1770 train_time:94887ms step_avg:104.27ms | |
| step:911/1770 train_time:94999ms step_avg:104.28ms | |
| step:912/1770 train_time:95103ms step_avg:104.28ms | |
| step:913/1770 train_time:95210ms step_avg:104.28ms | |
| step:914/1770 train_time:95317ms step_avg:104.29ms | |
| step:915/1770 train_time:95424ms step_avg:104.29ms | |
| step:916/1770 train_time:95531ms step_avg:104.29ms | |
| step:917/1770 train_time:95638ms step_avg:104.29ms | |
| step:918/1770 train_time:95745ms step_avg:104.30ms | |
| step:919/1770 train_time:95853ms step_avg:104.30ms | |
| step:920/1770 train_time:95961ms step_avg:104.31ms | |
| step:921/1770 train_time:96070ms step_avg:104.31ms | |
| step:922/1770 train_time:96178ms step_avg:104.31ms | |
| step:923/1770 train_time:96286ms step_avg:104.32ms | |
| step:924/1770 train_time:96395ms step_avg:104.32ms | |
| step:925/1770 train_time:96502ms step_avg:104.33ms | |
| step:926/1770 train_time:96610ms step_avg:104.33ms | |
| step:927/1770 train_time:96719ms step_avg:104.34ms | |
| step:928/1770 train_time:96827ms step_avg:104.34ms | |
| step:929/1770 train_time:96936ms step_avg:104.34ms | |
| step:930/1770 train_time:97045ms step_avg:104.35ms | |
| step:931/1770 train_time:97153ms step_avg:104.35ms | |
| step:932/1770 train_time:97262ms step_avg:104.36ms | |
| step:933/1770 train_time:97371ms step_avg:104.36ms | |
| step:934/1770 train_time:97480ms step_avg:104.37ms | |
| step:935/1770 train_time:97587ms step_avg:104.37ms | |
| step:936/1770 train_time:97696ms step_avg:104.38ms | |
| step:937/1770 train_time:97805ms step_avg:104.38ms | |
| step:938/1770 train_time:97913ms step_avg:104.38ms | |
| step:939/1770 train_time:98022ms step_avg:104.39ms | |
| step:940/1770 train_time:98131ms step_avg:104.39ms | |
| step:941/1770 train_time:98240ms step_avg:104.40ms | |
| step:942/1770 train_time:98349ms step_avg:104.40ms | |
| step:943/1770 train_time:98460ms step_avg:104.41ms | |
| step:944/1770 train_time:98568ms step_avg:104.42ms | |
| step:945/1770 train_time:98676ms step_avg:104.42ms | |
| step:946/1770 train_time:98785ms step_avg:104.42ms | |
| step:947/1770 train_time:98892ms step_avg:104.43ms | |
| step:948/1770 train_time:99002ms step_avg:104.43ms | |
| step:949/1770 train_time:99110ms step_avg:104.44ms | |
| step:950/1770 train_time:99218ms step_avg:104.44ms | |
| step:951/1770 train_time:99326ms step_avg:104.44ms | |
| step:952/1770 train_time:99434ms step_avg:104.45ms | |
| step:953/1770 train_time:99543ms step_avg:104.45ms | |
| step:954/1770 train_time:99652ms step_avg:104.46ms | |
| step:955/1770 train_time:99761ms step_avg:104.46ms | |
| step:956/1770 train_time:99870ms step_avg:104.47ms | |
| step:957/1770 train_time:99979ms step_avg:104.47ms | |
| step:958/1770 train_time:100086ms step_avg:104.47ms | |
| step:959/1770 train_time:100195ms step_avg:104.48ms | |
| step:960/1770 train_time:100305ms step_avg:104.48ms | |
| step:961/1770 train_time:100413ms step_avg:104.49ms | |
| step:962/1770 train_time:100521ms step_avg:104.49ms | |
| step:963/1770 train_time:100629ms step_avg:104.50ms | |
| step:964/1770 train_time:100738ms step_avg:104.50ms | |
| step:965/1770 train_time:100847ms step_avg:104.50ms | |
| step:966/1770 train_time:100954ms step_avg:104.51ms | |
| step:967/1770 train_time:101062ms step_avg:104.51ms | |
| step:968/1770 train_time:101171ms step_avg:104.52ms | |
| step:969/1770 train_time:101279ms step_avg:104.52ms | |
| step:970/1770 train_time:101388ms step_avg:104.52ms | |
| step:971/1770 train_time:101498ms step_avg:104.53ms | |
| step:972/1770 train_time:101604ms step_avg:104.53ms | |
| step:973/1770 train_time:101713ms step_avg:104.54ms | |
| step:974/1770 train_time:101822ms step_avg:104.54ms | |
| step:975/1770 train_time:101930ms step_avg:104.54ms | |
| step:976/1770 train_time:102038ms step_avg:104.55ms | |
| step:977/1770 train_time:102147ms step_avg:104.55ms | |
| step:978/1770 train_time:102255ms step_avg:104.56ms | |
| step:979/1770 train_time:102364ms step_avg:104.56ms | |
| step:980/1770 train_time:102472ms step_avg:104.56ms | |
| step:981/1770 train_time:102587ms step_avg:104.57ms | |
| step:982/1770 train_time:102694ms step_avg:104.58ms | |
| step:983/1770 train_time:102801ms step_avg:104.58ms | |
| step:984/1770 train_time:102909ms step_avg:104.58ms | |
| step:985/1770 train_time:103018ms step_avg:104.59ms | |
| step:986/1770 train_time:103128ms step_avg:104.59ms | |
| step:987/1770 train_time:103233ms step_avg:104.59ms | |
| step:988/1770 train_time:103343ms step_avg:104.60ms | |
| step:989/1770 train_time:103451ms step_avg:104.60ms | |
| step:990/1770 train_time:103560ms step_avg:104.61ms | |
| step:991/1770 train_time:103669ms step_avg:104.61ms | |
| step:992/1770 train_time:103777ms step_avg:104.61ms | |
| step:993/1770 train_time:103885ms step_avg:104.62ms | |
| step:994/1770 train_time:103993ms step_avg:104.62ms | |
| step:995/1770 train_time:104101ms step_avg:104.62ms | |
| step:996/1770 train_time:104209ms step_avg:104.63ms | |
| step:997/1770 train_time:104317ms step_avg:104.63ms | |
| step:998/1770 train_time:104425ms step_avg:104.63ms | |
| step:999/1770 train_time:104533ms step_avg:104.64ms | |
| step:1000/1770 train_time:104642ms step_avg:104.64ms | |
| step:1000/1770 val_loss:4.1744 train_time:104646ms step_avg:104.65ms | |
| step:1001/1770 train_time:104756ms step_avg:104.65ms | |
| step:1002/1770 train_time:104864ms step_avg:104.65ms | |
| step:1003/1770 train_time:104973ms step_avg:104.66ms | |
| step:1004/1770 train_time:105087ms step_avg:104.67ms | |
| step:1005/1770 train_time:105192ms step_avg:104.67ms | |
| step:1006/1770 train_time:105301ms step_avg:104.67ms | |
| step:1007/1770 train_time:105409ms step_avg:104.68ms | |
| step:1008/1770 train_time:105517ms step_avg:104.68ms | |
| step:1009/1770 train_time:105625ms step_avg:104.68ms | |
| step:1010/1770 train_time:105734ms step_avg:104.69ms | |
| step:1011/1770 train_time:105842ms step_avg:104.69ms | |
| step:1012/1770 train_time:105950ms step_avg:104.69ms | |
| step:1013/1770 train_time:106058ms step_avg:104.70ms | |
| step:1014/1770 train_time:106165ms step_avg:104.70ms | |
| step:1015/1770 train_time:106273ms step_avg:104.70ms | |
| step:1016/1770 train_time:106382ms step_avg:104.71ms | |
| step:1017/1770 train_time:106491ms step_avg:104.71ms | |
| step:1018/1770 train_time:106599ms step_avg:104.71ms | |
| step:1019/1770 train_time:106707ms step_avg:104.72ms | |
| step:1020/1770 train_time:106816ms step_avg:104.72ms | |
| step:1021/1770 train_time:106929ms step_avg:104.73ms | |
| step:1022/1770 train_time:107035ms step_avg:104.73ms | |
| step:1023/1770 train_time:107143ms step_avg:104.73ms | |
| step:1024/1770 train_time:107252ms step_avg:104.74ms | |
| step:1025/1770 train_time:107359ms step_avg:104.74ms | |
| step:1026/1770 train_time:107468ms step_avg:104.74ms | |
| step:1027/1770 train_time:107576ms step_avg:104.75ms | |
| step:1028/1770 train_time:107684ms step_avg:104.75ms | |
| step:1029/1770 train_time:107793ms step_avg:104.76ms | |
| step:1030/1770 train_time:107903ms step_avg:104.76ms | |
| step:1031/1770 train_time:108010ms step_avg:104.76ms | |
| step:1032/1770 train_time:108122ms step_avg:104.77ms | |
| step:1033/1770 train_time:108228ms step_avg:104.77ms | |
| step:1034/1770 train_time:108336ms step_avg:104.77ms | |
| step:1035/1770 train_time:108444ms step_avg:104.78ms | |
| step:1036/1770 train_time:108552ms step_avg:104.78ms | |
| step:1037/1770 train_time:108663ms step_avg:104.79ms | |
| step:1038/1770 train_time:108770ms step_avg:104.79ms | |
| step:1039/1770 train_time:108878ms step_avg:104.79ms | |
| step:1040/1770 train_time:108987ms step_avg:104.79ms | |
| step:1041/1770 train_time:109095ms step_avg:104.80ms | |
| step:1042/1770 train_time:109204ms step_avg:104.80ms | |
| step:1043/1770 train_time:109312ms step_avg:104.81ms | |
| step:1044/1770 train_time:109420ms step_avg:104.81ms | |
| step:1045/1770 train_time:109528ms step_avg:104.81ms | |
| step:1046/1770 train_time:109636ms step_avg:104.81ms | |
| step:1047/1770 train_time:109744ms step_avg:104.82ms | |
| step:1048/1770 train_time:109853ms step_avg:104.82ms | |
| step:1049/1770 train_time:109961ms step_avg:104.82ms | |
| step:1050/1770 train_time:110071ms step_avg:104.83ms | |
| step:1051/1770 train_time:110180ms step_avg:104.83ms | |
| step:1052/1770 train_time:110289ms step_avg:104.84ms | |
| step:1053/1770 train_time:110397ms step_avg:104.84ms | |
| step:1054/1770 train_time:110505ms step_avg:104.84ms | |
| step:1055/1770 train_time:110614ms step_avg:104.85ms | |
| step:1056/1770 train_time:110722ms step_avg:104.85ms | |
| step:1057/1770 train_time:110831ms step_avg:104.85ms | |
| step:1058/1770 train_time:110939ms step_avg:104.86ms | |
| step:1059/1770 train_time:111048ms step_avg:104.86ms | |
| step:1060/1770 train_time:111156ms step_avg:104.86ms | |
| step:1061/1770 train_time:111265ms step_avg:104.87ms | |
| step:1062/1770 train_time:111373ms step_avg:104.87ms | |
| step:1063/1770 train_time:111482ms step_avg:104.88ms | |
| step:1064/1770 train_time:111591ms step_avg:104.88ms | |
| step:1065/1770 train_time:111701ms step_avg:104.88ms | |
| step:1066/1770 train_time:111812ms step_avg:104.89ms | |
| step:1067/1770 train_time:111922ms step_avg:104.89ms | |
| step:1068/1770 train_time:112033ms step_avg:104.90ms | |
| step:1069/1770 train_time:112139ms step_avg:104.90ms | |
| step:1070/1770 train_time:112247ms step_avg:104.90ms | |
| step:1071/1770 train_time:112357ms step_avg:104.91ms | |
| step:1072/1770 train_time:112466ms step_avg:104.91ms | |
| step:1073/1770 train_time:112576ms step_avg:104.92ms | |
| step:1074/1770 train_time:112684ms step_avg:104.92ms | |
| step:1075/1770 train_time:112793ms step_avg:104.92ms | |
| step:1076/1770 train_time:112902ms step_avg:104.93ms | |
| step:1077/1770 train_time:113011ms step_avg:104.93ms | |
| step:1078/1770 train_time:113119ms step_avg:104.93ms | |
| step:1079/1770 train_time:113230ms step_avg:104.94ms | |
| step:1080/1770 train_time:113336ms step_avg:104.94ms | |
| step:1081/1770 train_time:113444ms step_avg:104.94ms | |
| step:1082/1770 train_time:113553ms step_avg:104.95ms | |
| step:1083/1770 train_time:113662ms step_avg:104.95ms | |
| step:1084/1770 train_time:113771ms step_avg:104.95ms | |
| step:1085/1770 train_time:113881ms step_avg:104.96ms | |
| step:1086/1770 train_time:113990ms step_avg:104.96ms | |
| step:1087/1770 train_time:114098ms step_avg:104.97ms | |
| step:1088/1770 train_time:114206ms step_avg:104.97ms | |
| step:1089/1770 train_time:114315ms step_avg:104.97ms | |
| step:1090/1770 train_time:114423ms step_avg:104.98ms | |
| step:1091/1770 train_time:114532ms step_avg:104.98ms | |
| step:1092/1770 train_time:114643ms step_avg:104.98ms | |
| step:1093/1770 train_time:114751ms step_avg:104.99ms | |
| step:1094/1770 train_time:114859ms step_avg:104.99ms | |
| step:1095/1770 train_time:114969ms step_avg:104.99ms | |
| step:1096/1770 train_time:115077ms step_avg:105.00ms | |
| step:1097/1770 train_time:115185ms step_avg:105.00ms | |
| step:1098/1770 train_time:115294ms step_avg:105.00ms | |
| step:1099/1770 train_time:115403ms step_avg:105.01ms | |
| step:1100/1770 train_time:115512ms step_avg:105.01ms | |
| step:1101/1770 train_time:115621ms step_avg:105.01ms | |
| step:1102/1770 train_time:115733ms step_avg:105.02ms | |
| step:1103/1770 train_time:115840ms step_avg:105.02ms | |
| step:1104/1770 train_time:115948ms step_avg:105.03ms | |
| step:1105/1770 train_time:116057ms step_avg:105.03ms | |
| step:1106/1770 train_time:116165ms step_avg:105.03ms | |
| step:1107/1770 train_time:116275ms step_avg:105.04ms | |
| step:1108/1770 train_time:116383ms step_avg:105.04ms | |
| step:1109/1770 train_time:116492ms step_avg:105.04ms | |
| step:1110/1770 train_time:116600ms step_avg:105.05ms | |
| step:1111/1770 train_time:116709ms step_avg:105.05ms | |
| step:1112/1770 train_time:116817ms step_avg:105.05ms | |
| step:1113/1770 train_time:116926ms step_avg:105.05ms | |
| step:1114/1770 train_time:117034ms step_avg:105.06ms | |
| step:1115/1770 train_time:117142ms step_avg:105.06ms | |
| step:1116/1770 train_time:117250ms step_avg:105.06ms | |
| step:1117/1770 train_time:117358ms step_avg:105.07ms | |
| step:1118/1770 train_time:117467ms step_avg:105.07ms | |
| step:1119/1770 train_time:117575ms step_avg:105.07ms | |
| step:1120/1770 train_time:117685ms step_avg:105.08ms | |
| step:1121/1770 train_time:117794ms step_avg:105.08ms | |
| step:1122/1770 train_time:117904ms step_avg:105.08ms | |
| step:1123/1770 train_time:118014ms step_avg:105.09ms | |
| step:1124/1770 train_time:118122ms step_avg:105.09ms | |
| step:1125/1770 train_time:118231ms step_avg:105.09ms | |
| step:1125/1770 val_loss:4.1146 train_time:118235ms step_avg:105.10ms | |
| step:1126/1770 train_time:118347ms step_avg:105.10ms | |
| step:1127/1770 train_time:118455ms step_avg:105.11ms | |
| step:1128/1770 train_time:118565ms step_avg:105.11ms | |
| step:1129/1770 train_time:118674ms step_avg:105.11ms | |
| step:1130/1770 train_time:118783ms step_avg:105.12ms | |
| step:1131/1770 train_time:118891ms step_avg:105.12ms | |
| step:1132/1770 train_time:118999ms step_avg:105.12ms | |
| step:1133/1770 train_time:119108ms step_avg:105.13ms | |
| step:1134/1770 train_time:119216ms step_avg:105.13ms | |
| step:1135/1770 train_time:119326ms step_avg:105.13ms | |
| step:1136/1770 train_time:119435ms step_avg:105.14ms | |
| step:1137/1770 train_time:119544ms step_avg:105.14ms | |
| step:1138/1770 train_time:119652ms step_avg:105.14ms | |
| step:1139/1770 train_time:119761ms step_avg:105.15ms | |
| step:1140/1770 train_time:119869ms step_avg:105.15ms | |
| step:1141/1770 train_time:119978ms step_avg:105.15ms | |
| step:1142/1770 train_time:120088ms step_avg:105.16ms | |
| step:1143/1770 train_time:120197ms step_avg:105.16ms | |
| step:1144/1770 train_time:120306ms step_avg:105.16ms | |
| step:1145/1770 train_time:120414ms step_avg:105.17ms | |
| step:1146/1770 train_time:120523ms step_avg:105.17ms | |
| step:1147/1770 train_time:120632ms step_avg:105.17ms | |
| step:1148/1770 train_time:120743ms step_avg:105.18ms | |
| step:1149/1770 train_time:120849ms step_avg:105.18ms | |
| step:1150/1770 train_time:120957ms step_avg:105.18ms | |
| step:1151/1770 train_time:121065ms step_avg:105.18ms | |
| step:1152/1770 train_time:121174ms step_avg:105.19ms | |
| step:1153/1770 train_time:121282ms step_avg:105.19ms | |
| step:1154/1770 train_time:121390ms step_avg:105.19ms | |
| step:1155/1770 train_time:121500ms step_avg:105.19ms | |
| step:1156/1770 train_time:121608ms step_avg:105.20ms | |
| step:1157/1770 train_time:121717ms step_avg:105.20ms | |
| step:1158/1770 train_time:121824ms step_avg:105.20ms | |
| step:1159/1770 train_time:121933ms step_avg:105.21ms | |
| step:1160/1770 train_time:122043ms step_avg:105.21ms | |
| step:1161/1770 train_time:122150ms step_avg:105.21ms | |
| step:1162/1770 train_time:122258ms step_avg:105.21ms | |
| step:1163/1770 train_time:122367ms step_avg:105.22ms | |
| step:1164/1770 train_time:122476ms step_avg:105.22ms | |
| step:1165/1770 train_time:122585ms step_avg:105.22ms | |
| step:1166/1770 train_time:122694ms step_avg:105.23ms | |
| step:1167/1770 train_time:122803ms step_avg:105.23ms | |
| step:1168/1770 train_time:122912ms step_avg:105.23ms | |
| step:1169/1770 train_time:123022ms step_avg:105.24ms | |
| step:1170/1770 train_time:123130ms step_avg:105.24ms | |
| step:1171/1770 train_time:123240ms step_avg:105.24ms | |
| step:1172/1770 train_time:123350ms step_avg:105.25ms | |
| step:1173/1770 train_time:123456ms step_avg:105.25ms | |
| step:1174/1770 train_time:123566ms step_avg:105.25ms | |
| step:1175/1770 train_time:123674ms step_avg:105.25ms | |
| step:1176/1770 train_time:123782ms step_avg:105.26ms | |
| step:1177/1770 train_time:123891ms step_avg:105.26ms | |
| step:1178/1770 train_time:124000ms step_avg:105.26ms | |
| step:1179/1770 train_time:124109ms step_avg:105.27ms | |
| step:1180/1770 train_time:124220ms step_avg:105.27ms | |
| step:1181/1770 train_time:124326ms step_avg:105.27ms | |
| step:1182/1770 train_time:124436ms step_avg:105.28ms | |
| step:1183/1770 train_time:124546ms step_avg:105.28ms | |
| step:1184/1770 train_time:124656ms step_avg:105.28ms | |
| step:1185/1770 train_time:124767ms step_avg:105.29ms | |
| step:1186/1770 train_time:124878ms step_avg:105.29ms | |
| step:1187/1770 train_time:124989ms step_avg:105.30ms | |
| step:1188/1770 train_time:125099ms step_avg:105.30ms | |
| step:1189/1770 train_time:125209ms step_avg:105.31ms | |
| step:1190/1770 train_time:125320ms step_avg:105.31ms | |
| step:1191/1770 train_time:125430ms step_avg:105.31ms | |
| step:1192/1770 train_time:125539ms step_avg:105.32ms | |
| step:1193/1770 train_time:125651ms step_avg:105.32ms | |
| step:1194/1770 train_time:125757ms step_avg:105.32ms | |
| step:1195/1770 train_time:125867ms step_avg:105.33ms | |
| step:1196/1770 train_time:125977ms step_avg:105.33ms | |
| step:1197/1770 train_time:126086ms step_avg:105.34ms | |
| step:1198/1770 train_time:126196ms step_avg:105.34ms | |
| step:1199/1770 train_time:126306ms step_avg:105.34ms | |
| step:1200/1770 train_time:126416ms step_avg:105.35ms | |
| step:1201/1770 train_time:126528ms step_avg:105.35ms | |
| step:1202/1770 train_time:126639ms step_avg:105.36ms | |
| step:1203/1770 train_time:126748ms step_avg:105.36ms | |
| step:1204/1770 train_time:126858ms step_avg:105.36ms | |
| step:1205/1770 train_time:126968ms step_avg:105.37ms | |
| step:1206/1770 train_time:127079ms step_avg:105.37ms | |
| step:1207/1770 train_time:127189ms step_avg:105.38ms | |
| step:1208/1770 train_time:127299ms step_avg:105.38ms | |
| step:1209/1770 train_time:127409ms step_avg:105.38ms | |
| step:1210/1770 train_time:127520ms step_avg:105.39ms | |
| step:1211/1770 train_time:127630ms step_avg:105.39ms | |
| step:1212/1770 train_time:127739ms step_avg:105.40ms | |
| step:1213/1770 train_time:127849ms step_avg:105.40ms | |
| step:1214/1770 train_time:127958ms step_avg:105.40ms | |
| step:1215/1770 train_time:128067ms step_avg:105.40ms | |
| step:1216/1770 train_time:128177ms step_avg:105.41ms | |
| step:1217/1770 train_time:128286ms step_avg:105.41ms | |
| step:1218/1770 train_time:128395ms step_avg:105.41ms | |
| step:1219/1770 train_time:128506ms step_avg:105.42ms | |
| step:1220/1770 train_time:128617ms step_avg:105.42ms | |
| step:1221/1770 train_time:128727ms step_avg:105.43ms | |
| step:1222/1770 train_time:128838ms step_avg:105.43ms | |
| step:1223/1770 train_time:128948ms step_avg:105.44ms | |
| step:1224/1770 train_time:129058ms step_avg:105.44ms | |
| step:1225/1770 train_time:129168ms step_avg:105.44ms | |
| step:1226/1770 train_time:129279ms step_avg:105.45ms | |
| step:1227/1770 train_time:129390ms step_avg:105.45ms | |
| step:1228/1770 train_time:129503ms step_avg:105.46ms | |
| step:1229/1770 train_time:129612ms step_avg:105.46ms | |
| step:1230/1770 train_time:129720ms step_avg:105.46ms | |
| step:1231/1770 train_time:129830ms step_avg:105.47ms | |
| step:1232/1770 train_time:129940ms step_avg:105.47ms | |
| step:1233/1770 train_time:130050ms step_avg:105.47ms | |
| step:1234/1770 train_time:130159ms step_avg:105.48ms | |
| step:1235/1770 train_time:130267ms step_avg:105.48ms | |
| step:1236/1770 train_time:130376ms step_avg:105.48ms | |
| step:1237/1770 train_time:130486ms step_avg:105.49ms | |
| step:1238/1770 train_time:130596ms step_avg:105.49ms | |
| step:1239/1770 train_time:130706ms step_avg:105.49ms | |
| step:1240/1770 train_time:130816ms step_avg:105.50ms | |
| step:1241/1770 train_time:130927ms step_avg:105.50ms | |
| step:1242/1770 train_time:131036ms step_avg:105.50ms | |
| step:1243/1770 train_time:131146ms step_avg:105.51ms | |
| step:1244/1770 train_time:131256ms step_avg:105.51ms | |
| step:1245/1770 train_time:131367ms step_avg:105.52ms | |
| step:1246/1770 train_time:131478ms step_avg:105.52ms | |
| step:1247/1770 train_time:131589ms step_avg:105.52ms | |
| step:1248/1770 train_time:131699ms step_avg:105.53ms | |
| step:1249/1770 train_time:131811ms step_avg:105.53ms | |
| step:1250/1770 train_time:131921ms step_avg:105.54ms | |
| step:1250/1770 val_loss:4.0342 train_time:131924ms step_avg:105.54ms | |
| step:1251/1770 train_time:132036ms step_avg:105.54ms | |
| step:1252/1770 train_time:132146ms step_avg:105.55ms | |
| step:1253/1770 train_time:132256ms step_avg:105.55ms | |
| step:1254/1770 train_time:132366ms step_avg:105.55ms | |
| step:1255/1770 train_time:132475ms step_avg:105.56ms | |
| step:1256/1770 train_time:132584ms step_avg:105.56ms | |
| step:1257/1770 train_time:132693ms step_avg:105.56ms | |
| step:1258/1770 train_time:132802ms step_avg:105.57ms | |
| step:1259/1770 train_time:132912ms step_avg:105.57ms | |
| step:1260/1770 train_time:133022ms step_avg:105.57ms | |
| step:1261/1770 train_time:133132ms step_avg:105.58ms | |
| step:1262/1770 train_time:133242ms step_avg:105.58ms | |
| step:1263/1770 train_time:133352ms step_avg:105.58ms | |
| step:1264/1770 train_time:133464ms step_avg:105.59ms | |
| step:1265/1770 train_time:133572ms step_avg:105.59ms | |
| step:1266/1770 train_time:133682ms step_avg:105.59ms | |
| step:1267/1770 train_time:133791ms step_avg:105.60ms | |
| step:1268/1770 train_time:133901ms step_avg:105.60ms | |
| step:1269/1770 train_time:134011ms step_avg:105.60ms | |
| step:1270/1770 train_time:134120ms step_avg:105.61ms | |
| step:1271/1770 train_time:134231ms step_avg:105.61ms | |
| step:1272/1770 train_time:134341ms step_avg:105.61ms | |
| step:1273/1770 train_time:134450ms step_avg:105.62ms | |
| step:1274/1770 train_time:134559ms step_avg:105.62ms | |
| step:1275/1770 train_time:134669ms step_avg:105.62ms | |
| step:1276/1770 train_time:134778ms step_avg:105.63ms | |
| step:1277/1770 train_time:134888ms step_avg:105.63ms | |
| step:1278/1770 train_time:134999ms step_avg:105.63ms | |
| step:1279/1770 train_time:135109ms step_avg:105.64ms | |
| step:1280/1770 train_time:135218ms step_avg:105.64ms | |
| step:1281/1770 train_time:135329ms step_avg:105.64ms | |
| step:1282/1770 train_time:135440ms step_avg:105.65ms | |
| step:1283/1770 train_time:135551ms step_avg:105.65ms | |
| step:1284/1770 train_time:135660ms step_avg:105.65ms | |
| step:1285/1770 train_time:135771ms step_avg:105.66ms | |
| step:1286/1770 train_time:135881ms step_avg:105.66ms | |
| step:1287/1770 train_time:135990ms step_avg:105.66ms | |
| step:1288/1770 train_time:136099ms step_avg:105.67ms | |
| step:1289/1770 train_time:136211ms step_avg:105.67ms | |
| step:1290/1770 train_time:136319ms step_avg:105.67ms | |
| step:1291/1770 train_time:136430ms step_avg:105.68ms | |
| step:1292/1770 train_time:136539ms step_avg:105.68ms | |
| step:1293/1770 train_time:136649ms step_avg:105.68ms | |
| step:1294/1770 train_time:136760ms step_avg:105.69ms | |
| step:1295/1770 train_time:136869ms step_avg:105.69ms | |
| step:1296/1770 train_time:136979ms step_avg:105.69ms | |
| step:1297/1770 train_time:137089ms step_avg:105.70ms | |
| step:1298/1770 train_time:137200ms step_avg:105.70ms | |
| step:1299/1770 train_time:137309ms step_avg:105.70ms | |
| step:1300/1770 train_time:137419ms step_avg:105.71ms | |
| step:1301/1770 train_time:137530ms step_avg:105.71ms | |
| step:1302/1770 train_time:137640ms step_avg:105.71ms | |
| step:1303/1770 train_time:137751ms step_avg:105.72ms | |
| step:1304/1770 train_time:137862ms step_avg:105.72ms | |
| step:1305/1770 train_time:137972ms step_avg:105.73ms | |
| step:1306/1770 train_time:138080ms step_avg:105.73ms | |
| step:1307/1770 train_time:138190ms step_avg:105.73ms | |
| step:1308/1770 train_time:138301ms step_avg:105.73ms | |
| step:1309/1770 train_time:138412ms step_avg:105.74ms | |
| step:1310/1770 train_time:138522ms step_avg:105.74ms | |
| step:1311/1770 train_time:138632ms step_avg:105.75ms | |
| step:1312/1770 train_time:138739ms step_avg:105.75ms | |
| step:1313/1770 train_time:138850ms step_avg:105.75ms | |
| step:1314/1770 train_time:138960ms step_avg:105.75ms | |
| step:1315/1770 train_time:139070ms step_avg:105.76ms | |
| step:1316/1770 train_time:139180ms step_avg:105.76ms | |
| step:1317/1770 train_time:139290ms step_avg:105.76ms | |
| step:1318/1770 train_time:139400ms step_avg:105.77ms | |
| step:1319/1770 train_time:139509ms step_avg:105.77ms | |
| step:1320/1770 train_time:139620ms step_avg:105.77ms | |
| step:1321/1770 train_time:139730ms step_avg:105.78ms | |
| step:1322/1770 train_time:139840ms step_avg:105.78ms | |
| step:1323/1770 train_time:139950ms step_avg:105.78ms | |
| step:1324/1770 train_time:140060ms step_avg:105.79ms | |
| step:1325/1770 train_time:140170ms step_avg:105.79ms | |
| step:1326/1770 train_time:140284ms step_avg:105.79ms | |
| step:1327/1770 train_time:140392ms step_avg:105.80ms | |
| step:1328/1770 train_time:140500ms step_avg:105.80ms | |
| step:1329/1770 train_time:140612ms step_avg:105.80ms | |
| step:1330/1770 train_time:140721ms step_avg:105.80ms | |
| step:1331/1770 train_time:140831ms step_avg:105.81ms | |
| step:1332/1770 train_time:140941ms step_avg:105.81ms | |
| step:1333/1770 train_time:141052ms step_avg:105.82ms | |
| step:1334/1770 train_time:141162ms step_avg:105.82ms | |
| step:1335/1770 train_time:141275ms step_avg:105.82ms | |
| step:1336/1770 train_time:141382ms step_avg:105.82ms | |
| step:1337/1770 train_time:141492ms step_avg:105.83ms | |
| step:1338/1770 train_time:141602ms step_avg:105.83ms | |
| step:1339/1770 train_time:141711ms step_avg:105.83ms | |
| step:1340/1770 train_time:141822ms step_avg:105.84ms | |
| step:1341/1770 train_time:141932ms step_avg:105.84ms | |
| step:1342/1770 train_time:142042ms step_avg:105.84ms | |
| step:1343/1770 train_time:142152ms step_avg:105.85ms | |
| step:1344/1770 train_time:142261ms step_avg:105.85ms | |
| step:1345/1770 train_time:142375ms step_avg:105.85ms | |
| step:1346/1770 train_time:142481ms step_avg:105.86ms | |
| step:1347/1770 train_time:142591ms step_avg:105.86ms | |
| step:1348/1770 train_time:142701ms step_avg:105.86ms | |
| step:1349/1770 train_time:142811ms step_avg:105.86ms | |
| step:1350/1770 train_time:142920ms step_avg:105.87ms | |
| step:1351/1770 train_time:143030ms step_avg:105.87ms | |
| step:1352/1770 train_time:143142ms step_avg:105.87ms | |
| step:1353/1770 train_time:143253ms step_avg:105.88ms | |
| step:1354/1770 train_time:143363ms step_avg:105.88ms | |
| step:1355/1770 train_time:143473ms step_avg:105.88ms | |
| step:1356/1770 train_time:143582ms step_avg:105.89ms | |
| step:1357/1770 train_time:143692ms step_avg:105.89ms | |
| step:1358/1770 train_time:143803ms step_avg:105.89ms | |
| step:1359/1770 train_time:143916ms step_avg:105.90ms | |
| step:1360/1770 train_time:144022ms step_avg:105.90ms | |
| step:1361/1770 train_time:144132ms step_avg:105.90ms | |
| step:1362/1770 train_time:144241ms step_avg:105.90ms | |
| step:1363/1770 train_time:144353ms step_avg:105.91ms | |
| step:1364/1770 train_time:144464ms step_avg:105.91ms | |
| step:1365/1770 train_time:144575ms step_avg:105.92ms | |
| step:1366/1770 train_time:144684ms step_avg:105.92ms | |
| step:1367/1770 train_time:144794ms step_avg:105.92ms | |
| step:1368/1770 train_time:144905ms step_avg:105.92ms | |
| step:1369/1770 train_time:145015ms step_avg:105.93ms | |
| step:1370/1770 train_time:145124ms step_avg:105.93ms | |
| step:1371/1770 train_time:145234ms step_avg:105.93ms | |
| step:1372/1770 train_time:145345ms step_avg:105.94ms | |
| step:1373/1770 train_time:145455ms step_avg:105.94ms | |
| step:1374/1770 train_time:145565ms step_avg:105.94ms | |
| step:1375/1770 train_time:145677ms step_avg:105.95ms | |
| step:1375/1770 val_loss:3.9621 train_time:145681ms step_avg:105.95ms | |
| step:1376/1770 train_time:145792ms step_avg:105.95ms | |
| step:1377/1770 train_time:145904ms step_avg:105.96ms | |
| step:1378/1770 train_time:146014ms step_avg:105.96ms | |
| step:1379/1770 train_time:146124ms step_avg:105.96ms | |
| step:1380/1770 train_time:146234ms step_avg:105.97ms | |
| step:1381/1770 train_time:146346ms step_avg:105.97ms | |
| step:1382/1770 train_time:146458ms step_avg:105.98ms | |
| step:1383/1770 train_time:146568ms step_avg:105.98ms | |
| step:1384/1770 train_time:146679ms step_avg:105.98ms | |
| step:1385/1770 train_time:146789ms step_avg:105.99ms | |
| step:1386/1770 train_time:146900ms step_avg:105.99ms | |
| step:1387/1770 train_time:147011ms step_avg:105.99ms | |
| step:1388/1770 train_time:147121ms step_avg:105.99ms | |
| step:1389/1770 train_time:147232ms step_avg:106.00ms | |
| step:1390/1770 train_time:147342ms step_avg:106.00ms | |
| step:1391/1770 train_time:147452ms step_avg:106.00ms | |
| step:1392/1770 train_time:147563ms step_avg:106.01ms | |
| step:1393/1770 train_time:147672ms step_avg:106.01ms | |
| step:1394/1770 train_time:147782ms step_avg:106.01ms | |
| step:1395/1770 train_time:147897ms step_avg:106.02ms | |
| step:1396/1770 train_time:148007ms step_avg:106.02ms | |
| step:1397/1770 train_time:148117ms step_avg:106.03ms | |
| step:1398/1770 train_time:148226ms step_avg:106.03ms | |
| step:1399/1770 train_time:148338ms step_avg:106.03ms | |
| step:1400/1770 train_time:148448ms step_avg:106.03ms | |
| step:1401/1770 train_time:148558ms step_avg:106.04ms | |
| step:1402/1770 train_time:148670ms step_avg:106.04ms | |
| step:1403/1770 train_time:148779ms step_avg:106.04ms | |
| step:1404/1770 train_time:148889ms step_avg:106.05ms | |
| step:1405/1770 train_time:149000ms step_avg:106.05ms | |
| step:1406/1770 train_time:149109ms step_avg:106.05ms | |
| step:1407/1770 train_time:149220ms step_avg:106.06ms | |
| step:1408/1770 train_time:149330ms step_avg:106.06ms | |
| step:1409/1770 train_time:149440ms step_avg:106.06ms | |
| step:1410/1770 train_time:149550ms step_avg:106.06ms | |
| step:1411/1770 train_time:149660ms step_avg:106.07ms | |
| step:1412/1770 train_time:149770ms step_avg:106.07ms | |
| step:1413/1770 train_time:149880ms step_avg:106.07ms | |
| step:1414/1770 train_time:149991ms step_avg:106.08ms | |
| step:1415/1770 train_time:150105ms step_avg:106.08ms | |
| step:1416/1770 train_time:150211ms step_avg:106.08ms | |
| step:1417/1770 train_time:150321ms step_avg:106.08ms | |
| step:1418/1770 train_time:150431ms step_avg:106.09ms | |
| step:1419/1770 train_time:150541ms step_avg:106.09ms | |
| step:1420/1770 train_time:150652ms step_avg:106.09ms | |
| step:1421/1770 train_time:150763ms step_avg:106.10ms | |
| step:1422/1770 train_time:150876ms step_avg:106.10ms | |
| step:1423/1770 train_time:150986ms step_avg:106.10ms | |
| step:1424/1770 train_time:151095ms step_avg:106.11ms | |
| step:1425/1770 train_time:151205ms step_avg:106.11ms | |
| step:1426/1770 train_time:151314ms step_avg:106.11ms | |
| step:1427/1770 train_time:151423ms step_avg:106.11ms | |
| step:1428/1770 train_time:151533ms step_avg:106.12ms | |
| step:1429/1770 train_time:151643ms step_avg:106.12ms | |
| step:1430/1770 train_time:151753ms step_avg:106.12ms | |
| step:1431/1770 train_time:151864ms step_avg:106.12ms | |
| step:1432/1770 train_time:151975ms step_avg:106.13ms | |
| step:1433/1770 train_time:152086ms step_avg:106.13ms | |
| step:1434/1770 train_time:152196ms step_avg:106.13ms | |
| step:1435/1770 train_time:152308ms step_avg:106.14ms | |
| step:1436/1770 train_time:152415ms step_avg:106.14ms | |
| step:1437/1770 train_time:152525ms step_avg:106.14ms | |
| step:1438/1770 train_time:152635ms step_avg:106.14ms | |
| step:1439/1770 train_time:152745ms step_avg:106.15ms | |
| step:1440/1770 train_time:152855ms step_avg:106.15ms | |
| step:1441/1770 train_time:152966ms step_avg:106.15ms | |
| step:1442/1770 train_time:153077ms step_avg:106.16ms | |
| step:1443/1770 train_time:153188ms step_avg:106.16ms | |
| step:1444/1770 train_time:153300ms step_avg:106.16ms | |
| step:1445/1770 train_time:153410ms step_avg:106.17ms | |
| step:1446/1770 train_time:153522ms step_avg:106.17ms | |
| step:1447/1770 train_time:153634ms step_avg:106.17ms | |
| step:1448/1770 train_time:153746ms step_avg:106.18ms | |
| step:1449/1770 train_time:153857ms step_avg:106.18ms | |
| step:1450/1770 train_time:153969ms step_avg:106.19ms | |
| step:1451/1770 train_time:154081ms step_avg:106.19ms | |
| step:1452/1770 train_time:154192ms step_avg:106.19ms | |
| step:1453/1770 train_time:154304ms step_avg:106.20ms | |
| step:1454/1770 train_time:154415ms step_avg:106.20ms | |
| step:1455/1770 train_time:154526ms step_avg:106.20ms | |
| step:1456/1770 train_time:154637ms step_avg:106.21ms | |
| step:1457/1770 train_time:154750ms step_avg:106.21ms | |
| step:1458/1770 train_time:154862ms step_avg:106.22ms | |
| step:1459/1770 train_time:154973ms step_avg:106.22ms | |
| step:1460/1770 train_time:155084ms step_avg:106.22ms | |
| step:1461/1770 train_time:155197ms step_avg:106.23ms | |
| step:1462/1770 train_time:155309ms step_avg:106.23ms | |
| step:1463/1770 train_time:155419ms step_avg:106.23ms | |
| step:1464/1770 train_time:155530ms step_avg:106.24ms | |
| step:1465/1770 train_time:155640ms step_avg:106.24ms | |
| step:1466/1770 train_time:155751ms step_avg:106.24ms | |
| step:1467/1770 train_time:155862ms step_avg:106.25ms | |
| step:1468/1770 train_time:155973ms step_avg:106.25ms | |
| step:1469/1770 train_time:156084ms step_avg:106.25ms | |
| step:1470/1770 train_time:156195ms step_avg:106.26ms | |
| step:1471/1770 train_time:156305ms step_avg:106.26ms | |
| step:1472/1770 train_time:156416ms step_avg:106.26ms | |
| step:1473/1770 train_time:156527ms step_avg:106.26ms | |
| step:1474/1770 train_time:156639ms step_avg:106.27ms | |
| step:1475/1770 train_time:156750ms step_avg:106.27ms | |
| step:1476/1770 train_time:156862ms step_avg:106.27ms | |
| step:1477/1770 train_time:156973ms step_avg:106.28ms | |
| step:1478/1770 train_time:157085ms step_avg:106.28ms | |
| step:1479/1770 train_time:157198ms step_avg:106.29ms | |
| step:1480/1770 train_time:157309ms step_avg:106.29ms | |
| step:1481/1770 train_time:157420ms step_avg:106.29ms | |
| step:1482/1770 train_time:157531ms step_avg:106.30ms | |
| step:1483/1770 train_time:157645ms step_avg:106.30ms | |
| step:1484/1770 train_time:157756ms step_avg:106.30ms | |
| step:1485/1770 train_time:157866ms step_avg:106.31ms | |
| step:1486/1770 train_time:157977ms step_avg:106.31ms | |
| step:1487/1770 train_time:158088ms step_avg:106.31ms | |
| step:1488/1770 train_time:158200ms step_avg:106.32ms | |
| step:1489/1770 train_time:158311ms step_avg:106.32ms | |
| step:1490/1770 train_time:158421ms step_avg:106.32ms | |
| step:1491/1770 train_time:158532ms step_avg:106.33ms | |
| step:1492/1770 train_time:158646ms step_avg:106.33ms | |
| step:1493/1770 train_time:158754ms step_avg:106.33ms | |
| step:1494/1770 train_time:158865ms step_avg:106.34ms | |
| step:1495/1770 train_time:158977ms step_avg:106.34ms | |
| step:1496/1770 train_time:159088ms step_avg:106.34ms | |
| step:1497/1770 train_time:159203ms step_avg:106.35ms | |
| step:1498/1770 train_time:159312ms step_avg:106.35ms | |
| step:1499/1770 train_time:159424ms step_avg:106.35ms | |
| step:1500/1770 train_time:159535ms step_avg:106.36ms | |
| step:1500/1770 val_loss:3.9005 train_time:159539ms step_avg:106.36ms | |
| step:1501/1770 train_time:159651ms step_avg:106.36ms | |
| step:1502/1770 train_time:159762ms step_avg:106.37ms | |
| step:1503/1770 train_time:159874ms step_avg:106.37ms | |
| step:1504/1770 train_time:159986ms step_avg:106.37ms | |
| step:1505/1770 train_time:160098ms step_avg:106.38ms | |
| step:1506/1770 train_time:160209ms step_avg:106.38ms | |
| step:1507/1770 train_time:160321ms step_avg:106.38ms | |
| step:1508/1770 train_time:160431ms step_avg:106.39ms | |
| step:1509/1770 train_time:160544ms step_avg:106.39ms | |
| step:1510/1770 train_time:160655ms step_avg:106.39ms | |
| step:1511/1770 train_time:160767ms step_avg:106.40ms | |
| step:1512/1770 train_time:160878ms step_avg:106.40ms | |
| step:1513/1770 train_time:160989ms step_avg:106.40ms | |
| step:1514/1770 train_time:161100ms step_avg:106.41ms | |
| step:1515/1770 train_time:161212ms step_avg:106.41ms | |
| step:1516/1770 train_time:161324ms step_avg:106.41ms | |
| step:1517/1770 train_time:161436ms step_avg:106.42ms | |
| step:1518/1770 train_time:161547ms step_avg:106.42ms | |
| step:1519/1770 train_time:161659ms step_avg:106.42ms | |
| step:1520/1770 train_time:161771ms step_avg:106.43ms | |
| step:1521/1770 train_time:161886ms step_avg:106.43ms | |
| step:1522/1770 train_time:161994ms step_avg:106.43ms | |
| step:1523/1770 train_time:162105ms step_avg:106.44ms | |
| step:1524/1770 train_time:162216ms step_avg:106.44ms | |
| step:1525/1770 train_time:162328ms step_avg:106.44ms | |
| step:1526/1770 train_time:162441ms step_avg:106.45ms | |
| step:1527/1770 train_time:162553ms step_avg:106.45ms | |
| step:1528/1770 train_time:162664ms step_avg:106.46ms | |
| step:1529/1770 train_time:162776ms step_avg:106.46ms | |
| step:1530/1770 train_time:162887ms step_avg:106.46ms | |
| step:1531/1770 train_time:162998ms step_avg:106.46ms | |
| step:1532/1770 train_time:163110ms step_avg:106.47ms | |
| step:1533/1770 train_time:163221ms step_avg:106.47ms | |
| step:1534/1770 train_time:163332ms step_avg:106.47ms | |
| step:1535/1770 train_time:163443ms step_avg:106.48ms | |
| step:1536/1770 train_time:163554ms step_avg:106.48ms | |
| step:1537/1770 train_time:163666ms step_avg:106.48ms | |
| step:1538/1770 train_time:163777ms step_avg:106.49ms | |
| step:1539/1770 train_time:163888ms step_avg:106.49ms | |
| step:1540/1770 train_time:163999ms step_avg:106.49ms | |
| step:1541/1770 train_time:164111ms step_avg:106.50ms | |
| step:1542/1770 train_time:164220ms step_avg:106.50ms | |
| step:1543/1770 train_time:164331ms step_avg:106.50ms | |
| step:1544/1770 train_time:164441ms step_avg:106.50ms | |
| step:1545/1770 train_time:164552ms step_avg:106.51ms | |
| step:1546/1770 train_time:164665ms step_avg:106.51ms | |
| step:1547/1770 train_time:164777ms step_avg:106.51ms | |
| step:1548/1770 train_time:164890ms step_avg:106.52ms | |
| step:1549/1770 train_time:164998ms step_avg:106.52ms | |
| step:1550/1770 train_time:165109ms step_avg:106.52ms | |
| step:1551/1770 train_time:165224ms step_avg:106.53ms | |
| step:1552/1770 train_time:165333ms step_avg:106.53ms | |
| step:1553/1770 train_time:165444ms step_avg:106.53ms | |
| step:1554/1770 train_time:165554ms step_avg:106.53ms | |
| step:1555/1770 train_time:165666ms step_avg:106.54ms | |
| step:1556/1770 train_time:165777ms step_avg:106.54ms | |
| step:1557/1770 train_time:165887ms step_avg:106.54ms | |
| step:1558/1770 train_time:165998ms step_avg:106.55ms | |
| step:1559/1770 train_time:166109ms step_avg:106.55ms | |
| step:1560/1770 train_time:166220ms step_avg:106.55ms | |
| step:1561/1770 train_time:166332ms step_avg:106.55ms | |
| step:1562/1770 train_time:166442ms step_avg:106.56ms | |
| step:1563/1770 train_time:166556ms step_avg:106.56ms | |
| step:1564/1770 train_time:166664ms step_avg:106.56ms | |
| step:1565/1770 train_time:166776ms step_avg:106.57ms | |
| step:1566/1770 train_time:166886ms step_avg:106.57ms | |
| step:1567/1770 train_time:166997ms step_avg:106.57ms | |
| step:1568/1770 train_time:167108ms step_avg:106.57ms | |
| step:1569/1770 train_time:167220ms step_avg:106.58ms | |
| step:1570/1770 train_time:167331ms step_avg:106.58ms | |
| step:1571/1770 train_time:167443ms step_avg:106.58ms | |
| step:1572/1770 train_time:167554ms step_avg:106.59ms | |
| step:1573/1770 train_time:167664ms step_avg:106.59ms | |
| step:1574/1770 train_time:167782ms step_avg:106.60ms | |
| step:1575/1770 train_time:167890ms step_avg:106.60ms | |
| step:1576/1770 train_time:168000ms step_avg:106.60ms | |
| step:1577/1770 train_time:168113ms step_avg:106.60ms | |
| step:1578/1770 train_time:168224ms step_avg:106.61ms | |
| step:1579/1770 train_time:168339ms step_avg:106.61ms | |
| step:1580/1770 train_time:168450ms step_avg:106.61ms | |
| step:1581/1770 train_time:168559ms step_avg:106.62ms | |
| step:1582/1770 train_time:168670ms step_avg:106.62ms | |
| step:1583/1770 train_time:168783ms step_avg:106.62ms | |
| step:1584/1770 train_time:168893ms step_avg:106.62ms | |
| step:1585/1770 train_time:169004ms step_avg:106.63ms | |
| step:1586/1770 train_time:169117ms step_avg:106.63ms | |
| step:1587/1770 train_time:169229ms step_avg:106.63ms | |
| step:1588/1770 train_time:169341ms step_avg:106.64ms | |
| step:1589/1770 train_time:169453ms step_avg:106.64ms | |
| step:1590/1770 train_time:169565ms step_avg:106.64ms | |
| step:1591/1770 train_time:169676ms step_avg:106.65ms | |
| step:1592/1770 train_time:169788ms step_avg:106.65ms | |
| step:1593/1770 train_time:169901ms step_avg:106.65ms | |
| step:1594/1770 train_time:170012ms step_avg:106.66ms | |
| step:1595/1770 train_time:170125ms step_avg:106.66ms | |
| step:1596/1770 train_time:170234ms step_avg:106.66ms | |
| step:1597/1770 train_time:170344ms step_avg:106.67ms | |
| step:1598/1770 train_time:170457ms step_avg:106.67ms | |
| step:1599/1770 train_time:170568ms step_avg:106.67ms | |
| step:1600/1770 train_time:170678ms step_avg:106.67ms | |
| step:1601/1770 train_time:170788ms step_avg:106.68ms | |
| step:1602/1770 train_time:170900ms step_avg:106.68ms | |
| step:1603/1770 train_time:171009ms step_avg:106.68ms | |
| step:1604/1770 train_time:171121ms step_avg:106.68ms | |
| step:1605/1770 train_time:171231ms step_avg:106.69ms | |
| step:1606/1770 train_time:171343ms step_avg:106.69ms | |
| step:1607/1770 train_time:171454ms step_avg:106.69ms | |
| step:1608/1770 train_time:171565ms step_avg:106.69ms | |
| step:1609/1770 train_time:171679ms step_avg:106.70ms | |
| step:1610/1770 train_time:171790ms step_avg:106.70ms | |
| step:1611/1770 train_time:171901ms step_avg:106.70ms | |
| step:1612/1770 train_time:172012ms step_avg:106.71ms | |
| step:1613/1770 train_time:172124ms step_avg:106.71ms | |
| step:1614/1770 train_time:172235ms step_avg:106.71ms | |
| step:1615/1770 train_time:172345ms step_avg:106.72ms | |
| step:1616/1770 train_time:172457ms step_avg:106.72ms | |
| step:1617/1770 train_time:172570ms step_avg:106.72ms | |
| step:1618/1770 train_time:172684ms step_avg:106.73ms | |
| step:1619/1770 train_time:172793ms step_avg:106.73ms | |
| step:1620/1770 train_time:172904ms step_avg:106.73ms | |
| step:1621/1770 train_time:173019ms step_avg:106.74ms | |
| step:1622/1770 train_time:173130ms step_avg:106.74ms | |
| step:1623/1770 train_time:173241ms step_avg:106.74ms | |
| step:1624/1770 train_time:173354ms step_avg:106.74ms | |
| step:1625/1770 train_time:173467ms step_avg:106.75ms | |
| step:1625/1770 val_loss:3.8464 train_time:173470ms step_avg:106.75ms | |
| step:1626/1770 train_time:173582ms step_avg:106.75ms | |
| step:1627/1770 train_time:173694ms step_avg:106.76ms | |
| step:1628/1770 train_time:173805ms step_avg:106.76ms | |
| step:1629/1770 train_time:173916ms step_avg:106.76ms | |
| step:1630/1770 train_time:174029ms step_avg:106.77ms | |
| step:1631/1770 train_time:174140ms step_avg:106.77ms | |
| step:1632/1770 train_time:174251ms step_avg:106.77ms | |
| step:1633/1770 train_time:174362ms step_avg:106.77ms | |
| step:1634/1770 train_time:174473ms step_avg:106.78ms | |
| step:1635/1770 train_time:174584ms step_avg:106.78ms | |
| step:1636/1770 train_time:174696ms step_avg:106.78ms | |
| step:1637/1770 train_time:174807ms step_avg:106.79ms | |
| step:1638/1770 train_time:174919ms step_avg:106.79ms | |
| step:1639/1770 train_time:175032ms step_avg:106.79ms | |
| step:1640/1770 train_time:175143ms step_avg:106.79ms | |
| step:1641/1770 train_time:175256ms step_avg:106.80ms | |
| step:1642/1770 train_time:175365ms step_avg:106.80ms | |
| step:1643/1770 train_time:175476ms step_avg:106.80ms | |
| step:1644/1770 train_time:175587ms step_avg:106.80ms | |
| step:1645/1770 train_time:175699ms step_avg:106.81ms | |
| step:1646/1770 train_time:175812ms step_avg:106.81ms | |
| step:1647/1770 train_time:175922ms step_avg:106.81ms | |
| step:1648/1770 train_time:176033ms step_avg:106.82ms | |
| step:1649/1770 train_time:176145ms step_avg:106.82ms | |
| step:1650/1770 train_time:176255ms step_avg:106.82ms | |
| step:1651/1770 train_time:176366ms step_avg:106.82ms | |
| step:1652/1770 train_time:176477ms step_avg:106.83ms | |
| step:1653/1770 train_time:176590ms step_avg:106.83ms | |
| step:1654/1770 train_time:176701ms step_avg:106.83ms | |
| step:1655/1770 train_time:176811ms step_avg:106.83ms | |
| step:1656/1770 train_time:176922ms step_avg:106.84ms | |
| step:1657/1770 train_time:177033ms step_avg:106.84ms | |
| step:1658/1770 train_time:177145ms step_avg:106.84ms | |
| step:1659/1770 train_time:177256ms step_avg:106.85ms | |
| step:1660/1770 train_time:177368ms step_avg:106.85ms | |
| step:1661/1770 train_time:177482ms step_avg:106.85ms | |
| step:1662/1770 train_time:177593ms step_avg:106.86ms | |
| step:1663/1770 train_time:177704ms step_avg:106.86ms | |
| step:1664/1770 train_time:177816ms step_avg:106.86ms | |
| step:1665/1770 train_time:177928ms step_avg:106.86ms | |
| step:1666/1770 train_time:178040ms step_avg:106.87ms | |
| step:1667/1770 train_time:178152ms step_avg:106.87ms | |
| step:1668/1770 train_time:178263ms step_avg:106.87ms | |
| step:1669/1770 train_time:178376ms step_avg:106.88ms | |
| step:1670/1770 train_time:178487ms step_avg:106.88ms | |
| step:1671/1770 train_time:178597ms step_avg:106.88ms | |
| step:1672/1770 train_time:178708ms step_avg:106.88ms | |
| step:1673/1770 train_time:178819ms step_avg:106.88ms | |
| step:1674/1770 train_time:178929ms step_avg:106.89ms | |
| step:1675/1770 train_time:179041ms step_avg:106.89ms | |
| step:1676/1770 train_time:179152ms step_avg:106.89ms | |
| step:1677/1770 train_time:179264ms step_avg:106.90ms | |
| step:1678/1770 train_time:179376ms step_avg:106.90ms | |
| step:1679/1770 train_time:179487ms step_avg:106.90ms | |
| step:1680/1770 train_time:179598ms step_avg:106.90ms | |
| step:1681/1770 train_time:179709ms step_avg:106.91ms | |
| step:1682/1770 train_time:179821ms step_avg:106.91ms | |
| step:1683/1770 train_time:179933ms step_avg:106.91ms | |
| step:1684/1770 train_time:180043ms step_avg:106.91ms | |
| step:1685/1770 train_time:180154ms step_avg:106.92ms | |
| step:1686/1770 train_time:180265ms step_avg:106.92ms | |
| step:1687/1770 train_time:180376ms step_avg:106.92ms | |
| step:1688/1770 train_time:180487ms step_avg:106.92ms | |
| step:1689/1770 train_time:180598ms step_avg:106.93ms | |
| step:1690/1770 train_time:180710ms step_avg:106.93ms | |
| step:1691/1770 train_time:180821ms step_avg:106.93ms | |
| step:1692/1770 train_time:180934ms step_avg:106.94ms | |
| step:1693/1770 train_time:181045ms step_avg:106.94ms | |
| step:1694/1770 train_time:181157ms step_avg:106.94ms | |
| step:1695/1770 train_time:181268ms step_avg:106.94ms | |
| step:1696/1770 train_time:181380ms step_avg:106.95ms | |
| step:1697/1770 train_time:181492ms step_avg:106.95ms | |
| step:1698/1770 train_time:181603ms step_avg:106.95ms | |
| step:1699/1770 train_time:181715ms step_avg:106.95ms | |
| step:1700/1770 train_time:181825ms step_avg:106.96ms | |
| step:1701/1770 train_time:181937ms step_avg:106.96ms | |
| step:1702/1770 train_time:182049ms step_avg:106.96ms | |
| step:1703/1770 train_time:182160ms step_avg:106.96ms | |
| step:1704/1770 train_time:182272ms step_avg:106.97ms | |
| step:1705/1770 train_time:182383ms step_avg:106.97ms | |
| step:1706/1770 train_time:182494ms step_avg:106.97ms | |
| step:1707/1770 train_time:182609ms step_avg:106.98ms | |
| step:1708/1770 train_time:182721ms step_avg:106.98ms | |
| step:1709/1770 train_time:182834ms step_avg:106.98ms | |
| step:1710/1770 train_time:182945ms step_avg:106.99ms | |
| step:1711/1770 train_time:183056ms step_avg:106.99ms | |
| step:1712/1770 train_time:183167ms step_avg:106.99ms | |
| step:1713/1770 train_time:183281ms step_avg:106.99ms | |
| step:1714/1770 train_time:183394ms step_avg:107.00ms | |
| step:1715/1770 train_time:183508ms step_avg:107.00ms | |
| step:1716/1770 train_time:183621ms step_avg:107.01ms | |
| step:1717/1770 train_time:183735ms step_avg:107.01ms | |
| step:1718/1770 train_time:183848ms step_avg:107.01ms | |
| step:1719/1770 train_time:183960ms step_avg:107.02ms | |
| step:1720/1770 train_time:184071ms step_avg:107.02ms | |
| step:1721/1770 train_time:184183ms step_avg:107.02ms | |
| step:1722/1770 train_time:184293ms step_avg:107.02ms | |
| step:1723/1770 train_time:184407ms step_avg:107.03ms | |
| step:1724/1770 train_time:184520ms step_avg:107.03ms | |
| step:1725/1770 train_time:184632ms step_avg:107.03ms | |
| step:1726/1770 train_time:184746ms step_avg:107.04ms | |
| step:1727/1770 train_time:184858ms step_avg:107.04ms | |
| step:1728/1770 train_time:184971ms step_avg:107.04ms | |
| step:1729/1770 train_time:185082ms step_avg:107.05ms | |
| step:1730/1770 train_time:185194ms step_avg:107.05ms | |
| step:1731/1770 train_time:185304ms step_avg:107.05ms | |
| step:1732/1770 train_time:185417ms step_avg:107.05ms | |
| step:1733/1770 train_time:185529ms step_avg:107.06ms | |
| step:1734/1770 train_time:185639ms step_avg:107.06ms | |
| step:1735/1770 train_time:185754ms step_avg:107.06ms | |
| step:1736/1770 train_time:185866ms step_avg:107.07ms | |
| step:1737/1770 train_time:185978ms step_avg:107.07ms | |
| step:1738/1770 train_time:186089ms step_avg:107.07ms | |
| step:1739/1770 train_time:186201ms step_avg:107.07ms | |
| step:1740/1770 train_time:186313ms step_avg:107.08ms | |
| step:1741/1770 train_time:186426ms step_avg:107.08ms | |
| step:1742/1770 train_time:186540ms step_avg:107.08ms | |
| step:1743/1770 train_time:186651ms step_avg:107.09ms | |
| step:1744/1770 train_time:186763ms step_avg:107.09ms | |
| step:1745/1770 train_time:186876ms step_avg:107.09ms | |
| step:1746/1770 train_time:186987ms step_avg:107.09ms | |
| step:1747/1770 train_time:187098ms step_avg:107.10ms | |
| step:1748/1770 train_time:187211ms step_avg:107.10ms | |
| step:1749/1770 train_time:187326ms step_avg:107.10ms | |
| step:1750/1770 train_time:187438ms step_avg:107.11ms | |
| step:1750/1770 val_loss:3.8014 train_time:187443ms step_avg:107.11ms | |
| step:1751/1770 train_time:187556ms step_avg:107.11ms | |
| step:1752/1770 train_time:187669ms step_avg:107.12ms | |
| step:1753/1770 train_time:187780ms step_avg:107.12ms | |
| step:1754/1770 train_time:187892ms step_avg:107.12ms | |
| step:1755/1770 train_time:188005ms step_avg:107.13ms | |
| step:1756/1770 train_time:188117ms step_avg:107.13ms | |
| step:1757/1770 train_time:188229ms step_avg:107.13ms | |
| step:1758/1770 train_time:188341ms step_avg:107.13ms | |
| step:1759/1770 train_time:188454ms step_avg:107.14ms | |
| step:1760/1770 train_time:188565ms step_avg:107.14ms | |
| step:1761/1770 train_time:188677ms step_avg:107.14ms | |
| step:1762/1770 train_time:188790ms step_avg:107.15ms | |
| step:1763/1770 train_time:188901ms step_avg:107.15ms | |
| step:1764/1770 train_time:189014ms step_avg:107.15ms | |
| step:1765/1770 train_time:189130ms step_avg:107.16ms | |
| step:1766/1770 train_time:189243ms step_avg:107.16ms | |
| step:1767/1770 train_time:189354ms step_avg:107.16ms | |
| step:1768/1770 train_time:189465ms step_avg:107.16ms | |
| step:1769/1770 train_time:189578ms step_avg:107.17ms | |
| step:1770/1770 train_time:189691ms step_avg:107.17ms | |
| step:1770/1770 val_loss:3.7963 train_time:189695ms step_avg:107.17ms | |
| peak memory allocated: 30868 MiB reserved: 45252 MiB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment