import os
import sys
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

# must be set before torch is imported so the CUDA caching allocator picks it up
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
torch._inductor.config.coordinate_descent_tuning = True

# -----------------------------------------------------------------------------
# Custom operators : FP8 matmul for lm_head by @YouJiacheng

# Forward FP8 matmul. x and w are multiplied by x_s / w_s, cast to float8_e4m3fn and
# multiplied (x_f8 @ w_f8.t()) with torch._scaled_mm, which receives the inverse scales
# and produces a bfloat16 result. Returns (out, x_f8, w_f8); the two FP8 tensors are
# what setup_context saves for the backward pass. grad_s is not used in the forward.
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        x_f8 = x.mul(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.mul(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.t(),
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(1 / x_s, dtype=torch.float32),
            scale_b=x.new_tensor(1 / w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8

    return impl(x, w)

# Fake implementation registered for nanogpt::mm: checks that both inputs are 2D,
# share their inner dimension and device, and are contiguous, then returns stand-in
# outputs built with ordinary ops instead of the FP8 kernel.
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    assert x.ndim == w.ndim == 2
    assert x.shape[1] == w.shape[1]
    assert x.device == w.device
    assert x.is_contiguous() and w.is_contiguous()
    return x @ w.t(), x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn)

# Backward FP8 matmul. The incoming gradient is multiplied by grad_s and cast to
# float8_e5m2; grad_x (bfloat16) and grad_w (float32) are then produced by two
# torch._scaled_mm calls that are given the inverse scales.
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        x_inv_s = grad.new_tensor(1 / x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(1 / w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(1 / grad_s, dtype=torch.float32)
        grad_f8 = grad.mul(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.t().contiguous().t(),
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.t().contiguous(),
            grad_f8.t().contiguous().t(),
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).t()
        return grad_x, grad_w

    return impl(g, x_f8, w_f8)

# Fake implementation registered for nanogpt::mm_backward: returns tensors shaped like
# x_f8 (as bfloat16) and w_f8 (as float32), matching the dtypes requested above.
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    return x_f8.to(torch.bfloat16), w_f8.to(torch.float32)

# Autograd backward for nanogpt::mm: only the gradient of the first output is used;
# the three scale arguments receive None.
def backward(ctx, grad_out: Tensor, *_):
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    return grad_x, grad_w, None, None, None

# Stashes what backward() needs: the FP8 copies produced by the forward and the scales.
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)

mm_op.register_autograd(backward, setup_context=setup_context)

# Applies the FP8 matmul op to x with all leading dimensions flattened into one,
# using fixed scales, and restores the leading dimensions on the result.
def lm_head_fp8(x: Tensor, w: Tensor) -> Tensor:
    _x = x.flatten(0, -2)
    out: Tensor = torch.ops.nanogpt.mm(_x, w, x_s=2.0, w_s=32.0, grad_s=2.0**29)[0]
    return out.reshape(*x.shape[:-1], -1)

# -----------------------------------------------------------------------------
# Muon optimizer

@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # work on the transposed matrix when there are more rows than columns, so that
    # A = X @ X.mT below is the smaller of the two possible Gram matrices
    if G.size(-2) > G.size(-1):
        X = X.mT

    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X

    # undo the transpose applied above
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
        rank: Index of this process; selects which parameter of each chunk it orthogonalizes.
        world_size: Number of processes taking part in the all-gather of updates.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1):
        self.rank = rank
        self.world_size = world_size
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params: list[Tensor] = [*params]
        assert all(isinstance(p, Tensor) for p in params)
        # one param group per distinct numel, so every parameter in a group fits the
        # same (world_size, numel) bfloat16 gather buffer
        sizes = {p.numel() for p in params}
        def create_update_buffer(size: int):
            b = torch.empty(self.world_size, size, dtype=torch.bfloat16, device="cuda")
            return dict(update_buffer=b, update_buffer_views=[b[i] for i in range(self.world_size)])
        param_groups = [
            dict(params=[p for p in params if p.numel() == size], **create_update_buffer(size)) for size in sizes]
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group["lr"]
            momentum = group["momentum"]
            nesterov = group["nesterov"]
            ns_steps = group["ns_steps"]
            update_buffer = group["update_buffer"]
            update_buffer_views: list[Tensor] = group["update_buffer_views"]
            # generate weight updates in distributed fashion
            params: list[Tensor] = group["params"]
            handle = None
            params_world = None
            # Applies the updates gathered for the previous chunk of parameters: waits for the
            # pending all_gather, then adds each gathered row to its parameter with step size
            # lr * sqrt(max(1, rows / cols)). zip() stops at the shorter sequence, so a final
            # chunk with fewer than world_size parameters ignores the surplus buffer rows.
            def update_prev(): # optimized Muon implementation contributed by @YouJiacheng
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffer_views):
                    p_world.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(-2) / p_world.size(-1)) ** 0.5,
                    )
            # walk the group in chunks of world_size; within a chunk this rank handles
            # the parameter at offset self.rank
            for base_i in range(len(params))[::self.world_size]:
                if base_i + self.rank < len(params):
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if "momentum_buffer" not in state:
                        state["momentum_buffer"] = torch.zeros_like(g)
                    buf: Tensor = state["momentum_buffer"]
                    buf.lerp_(g, 1 - momentum)
                    g = g.lerp_(buf, momentum) if nesterov else buf
                    g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                else:
                    # no parameter for this rank in the last chunk: send this rank's own
                    # buffer row so the collective still has an input on every rank
                    g = update_buffer_views[self.rank]
                # the previous gather must be consumed before update_buffer is overwritten
                update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                handle = dist.all_gather_into_tensor(update_buffer, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # apply the updates of the final chunk
            update_prev()

# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the model

# RMS normalization over the last dimension; no weight is passed to F.rms_norm
def norm(x):
    return F.rms_norm(x, (x.size(-1),))

# Bias-free nn.Linear whose weight is cast to the input's dtype on every forward call
class CastedLinear(nn.Linear):
    def __init__(self, in_features: int, out_features: int):
        super().__init__(in_features, out_features, bias=False)

    def reset_parameters(self) -> None:
        std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3)
        bound = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)

    def forward(self, x):
        return F.linear(x, self.weight.type_as(x))

# Rotary position embedding with cos/sin tables precomputed for max_seq_len positions
class Rotary(nn.Module):
    def __init__(self, dim: int, max_seq_len=65536):
        super().__init__()
        # half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
        # first dim//4 frequencies run from 1 down to 1/1024; the other dim//4 are zero
        angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
        angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
        t = torch.arange(max_seq_len, dtype=torch.float32)
        theta = torch.einsum("i,j -> ij", t, angular_freq)
        self.cos = nn.Buffer(theta.cos(), persistent=False)
        self.sin = nn.Buffer(theta.sin(), persistent=False)

    def forward(self, x_BTHD: Tensor):
        # dimension -3 is indexed as the sequence position; the table must cover it
        assert self.cos.size(0) >= x_BTHD.size(-3)
        cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :]
        # rotate the two halves of the last dimension against each other, in float32
        x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        return torch.cat((y1, y2), 3).type_as(x_BTHD)

# Self-attention built on flex_attention with a merged QKV weight, QK-norm, rotary
# embeddings and an optional value-embedding mix-in
class CausalSelfAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int, layer_idx: int):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, dim, dim).uniform_(-bound, bound))
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12

    def forward(self, x: Tensor, ve: Tensor | None, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # one matmul against the flattened (3*dim, dim) weight, then split into q, k, v heads
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1).type_as(x)).view(B, T, 3*self.num_heads, -1).chunk(3, dim=-2)
        if ve is not None:
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, scale=self.attn_scale)
        y = y.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

# Two-layer feed-forward network (dim -> 4*dim -> dim) with a squared-ReLU activation
class MLP(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.c_fc = CastedLinear(dim, 4 * dim)
        self.c_proj = CastedLinear(4 * dim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        x = self.c_fc(x)
        x = F.relu(x).square() # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        x = self.c_proj(x)
        return x

# Transformer block: learned mix of the running activations with x0, then residual
# attention (absent for layer_idx == 7) and residual MLP, each applied to norm(x)
class Block(nn.Module):
    def __init__(self, model_dim: int, num_heads: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = CausalSelfAttention(model_dim, num_heads, layer_idx) if layer_idx != 7 else None
        self.mlp = MLP(model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, ve, x0, block_mask):
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, block_mask)
        x = x + self.mlp(norm(x))
        return x

# Three embedding tables whose outputs are handed to the first three and last three layers
class ValueEmbedding(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(num_embeddings, embedding_dim) for _ in range(3)])

    def forward(self, input_seq) -> list[Tensor | None]:
        ve = [emb(input_seq) for emb in self.embed]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        # NOTE: the list always has 12 entries, so it only lines up with a 12-layer model
        # (GPT.forward asserts len(ve) == len(self.blocks))
        ve = [ve[0], ve[1], ve[2], None, None, None, None, None, None, ve[0], ve[1], ve[2]]
        return ve

# -----------------------------------------------------------------------------
# The main model

# Smallest multiple of n that is >= v, and never less than n (v == 0 yields n)
def next_multiple_of_n(v: float | int, *, n: int):
    return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)

# GPT-style language model whose forward pass returns the training loss directly
class GPT(nn.Module):
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        self.value_embeds = ValueEmbedding(vocab_size, model_dim)
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, layer_idx) for layer_idx in range(num_layers)])
        # U-net design by @brendanh0gan
        self.num_encoder_layers = num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, next_multiple_of_n(vocab_size, n=128))
        self.lm_head.weight.detach().zero_() # @Grad62304977

    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        BLOCK_SIZE = 128
        assert input_seq.ndim == 1
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        # running count of token id 50256, used below as a per-token document id
        docs = (input_seq == 50256).cumsum(0)
        # document id of the first and of the last token in every block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()

        # token-level mask: a query sees keys at or before it that share its document id
        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        # per-row count of set entries, plus column indices ordered so that the set
        # entries come first; both get two leading singleton dimensions
        def dense_to_ordered(dense_mask: Tensor):
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        # manual block mask creation by @YouJiacheng
        def create_doc_swc_block_masks(sliding_window_num_blocks: Tensor):
            kv_idx = block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            # block pairs whose document-id ranges overlap / consist of one identical id
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & document_bm
            full_bm = causal_full_bm & document_full_bm
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm & ~full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            # limits the per-row block counts by the window size: full blocks are capped at
            # sw_num_blocks - 1, partial blocks at max(sw_num_blocks - full_kv_num_blocks, 1)
            def build_bm(sw_num_blocks: Tensor) -> BlockMask:
                return BlockMask.from_kv_blocks(
                    torch.clamp_max(kv_num_blocks, torch.clamp_min(sw_num_blocks - full_kv_num_blocks, 1)),
                    kv_indices,
                    torch.clamp_max(full_kv_num_blocks, sw_num_blocks - 1),
                    full_kv_indices,
                    BLOCK_SIZE=BLOCK_SIZE,
                    mask_mod=document_causal,
                )
            # long mask uses the full window, short mask half of it
            return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapted from suggestion by @Grad62304977, following Gemma 2 paper
        long_bm, short_bm = create_doc_swc_block_masks(sliding_window_num_blocks)

        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        ve = self.value_embeds(input_seq)
        assert len(ve) == len(self.blocks)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
        assert len(ve_enc) == self.num_encoder_layers and len(ve_dec) == self.num_decoder_layers

        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        # NOTE: six masks are listed, so at most six encoder and six decoder layers are supported
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm]
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_masks[i])
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        block_masks.reverse()
        for i in range(self.num_decoder_layers):
            # pop() pairs the first decoder layer with the last encoder output, and so on
            x = x + self.skip_weights[i] * skip_connections.pop()
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_masks[i])
        x = norm(x)
        # the FP8 head is only used in training mode; eval goes through the regular CastedLinear
        logits = lm_head_fp8(x, self.lm_head.weight) if self.training else self.lm_head(x)
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits.float() / 7.5)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq)
        return loss

# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader

# Reads one .bin shard: a header of 256 int32 values (magic number, version, token count)
# followed by the tokens as uint16. Returns the tokens as a pinned-memory uint16 tensor.
def _load_data_shard(file: Path):
    header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) # header is 256 int32
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    num_tokens = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as f:
        tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) # avoid pin_memory copy by @YouJiacheng
        f.seek(256 * 4) # skip the header (256 values of 4 bytes)
        nbytes = f.readinto(tokens.numpy()) # avoid bytes->array copy by @YouJiacheng
        assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens

# Infinite generator of (inputs, targets) CUDA tensors for this rank. batch_size is the
# token count across all ranks per step; each rank reads its own local_batch_size slice.
# NOTE(review): when the shard list is empty or exhausted, next(file_iter) raises
# StopIteration inside the generator, which callers see as a RuntimeError.
def distributed_data_generator(filename_pattern: str, batch_size: int, rank : int, world_size : int):
    files = sorted(Path.cwd().glob(filename_pattern))
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    while True:
        # move to the next shard when the current one cannot supply batch_size + 1 more tokens
        if pos + batch_size + 1 >= len(tokens):
            tokens, pos = _load_data_shard(next(file_iter)), 0
        # one extra token so that targets can be the inputs shifted by one position
        buf = tokens[pos + rank * local_batch_size:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_size
        yield inputs, targets

# -----------------------------------------------------------------------------
# int main

# NOTE(review): none of the attributes below is annotated, so @dataclass registers no
# fields; they are plain class attributes read through the instance.
@dataclass
class Hyperparameters:
    # data
    train_files = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # optimization
    batch_size = 8*64*1024 # batch size in tokens
    num_iterations = 1393 # number of iterations to run
    cooldown_frac = 0.4 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every = 125 # every how many steps to evaluate val loss? 0 for only at the end
    # implementation
    seq_len = 64*1024 # FlexAttention sequence length
    save_checkpoint = False
args = Hyperparameters()

# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
# Appends s to the log file on the master process only; also echoes it to stdout when
# console is true. Other ranks do nothing.
def print0(s, console=False):
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
# Returns the standard output of the nvidia-smi command as text
def nvidia_smi():
    import subprocess  # avoid top level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout
print0(nvidia_smi())
print0("="*100)

# load data
train_loader = distributed_data_generator(args.train_files, args.batch_size, rank, world_size)

model = GPT(vocab_size=50257, num_layers=12, num_heads=6, model_dim=768).cuda()
# embedding tables are converted to bfloat16
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
# every rank starts from rank 0's parameter values
for param in model.parameters():
    dist.broadcast(param.detach(), 0)

# collect the parameters to optimize
hidden_matrix_params = [p for p in model.blocks.parameters() if p.ndim >= 2]
embed_params = [model.embed.weight, *model.value_embeds.parameters()]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]

# init the optimizer(s)
adam_params = [dict(params=head_params, lr=0.008), dict(params=embed_params, lr=0.6), dict(params=scalar_params, lr=0.04)]
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = torch.optim.Adam(adam_params, betas=(0.8, 0.95), fused=True, eps=1e-10)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, rank=rank, world_size=world_size)
optimizers = [optimizer1, optimizer2]

# learning rate schedule: stable then decay
# returns the multiplier applied to each base lr: 1.0 until the final cooldown_frac of
# training, then a linear ramp down to 0.1 at the last iteration
def get_lr(it: int):
    t = 1 - it / args.num_iterations # time remaining in training
    assert 1 >= t >= 0
    w = min(t / args.cooldown_frac, 1.0) # 1 -> 0
    return w * 1.0 + (1 - w) * 0.1
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]
# window size in tokens -> number of 128-token blocks as an int32 CUDA tensor; the
# single-entry cache avoids rebuilding the tensor while the window size is unchanged
@lru_cache(1)
def sw_num_blks(window_size: int):
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)

model: nn.Module = torch.compile(model)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float("nan") if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val

    # Linearly increase the block-wise sliding window size over training 128 -> 1792:
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * step / train_steps, n=128)
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_bs = world_size * args.seq_len
        assert args.val_tokens % val_bs == 0
        val_steps = args.val_tokens // val_bs
        # a fresh generator per evaluation, so the same validation tokens are read each time
        val_loader = distributed_data_generator(args.val_files, val_bs, rank, world_size)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                x, y = next(val_loader)
                val_loss += model(x, y, sw_num_blks(window_size))
        val_loss /= val_steps
        del val_loader
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    inputs, targets = next(train_loader)
    # this rank's share of the batch is processed in seq_len-sized pieces, accumulating gradients
    for input_seq, target_seq in zip(inputs.split(args.seq_len), targets.split(args.seq_len)):
        model(input_seq, target_seq, sw_num_blks(window_size)).backward()
    # average gradients across ranks
    for param in model.parameters():
        dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
    # momentum warmup for Muon
    # 0.85 -> 0.95 linearly over the first 300 steps
    frac = min(step / 300, 1)
    for group in optimizer2.param_groups:
        group["momentum"] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # logging
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms", console=True)

print0(
    f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
    f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB"
)
dist.destroy_process_group()
==================================================================================================== Running Python 3.12.7 (main, Jan 17 2025, 03:57:17) [GCC 13.2.0] Running PyTorch 2.7.0.dev20250110+cu126 compiled for CUDA 12.6 Fri Jan 17 08:30:35 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 550.127.05 Driver Version: 550.127.05 CUDA Version: 12.6 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+========================+======================| | 0 NVIDIA H100 80GB HBM3 On | 00000000:61:00.0 Off | 0 | | N/A 32C P0 117W / 700W | 7746MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 1 NVIDIA H100 80GB HBM3 On | 00000000:62:00.0 Off | 0 | | N/A 37C P0 123W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 2 NVIDIA H100 80GB HBM3 On | 00000000:63:00.0 Off | 0 | | N/A 39C P0 121W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 3 NVIDIA H100 80GB HBM3 On | 00000000:64:00.0 Off | 0 | | N/A 32C P0 120W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 4 NVIDIA H100 80GB HBM3 On | 00000000:6A:00.0 Off | 0 | | N/A 34C P0 121W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 5 NVIDIA H100 80GB HBM3 On | 00000000:6B:00.0 Off | 0 | | N/A 39C P0 123W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 6 NVIDIA H100 80GB HBM3 On | 00000000:6C:00.0 Off | 0 | | N/A 40C P0 124W / 700W | 3456MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ | 7 NVIDIA H100 80GB HBM3 On | 00000000:6D:00.0 Off | 0 | | N/A 36C P0 120W / 700W | 3216MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ 
+-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| +-----------------------------------------------------------------------------------------+ ==================================================================================================== step:0/1393 val_loss:10.8258 train_time:0ms step_avg:nanms step:1/1393 train_time:17608ms step_avg:nanms step:2/1393 train_time:17900ms step_avg:nanms step:3/1393 train_time:18019ms step_avg:nanms step:4/1393 train_time:18138ms step_avg:nanms step:5/1393 train_time:18260ms step_avg:nanms step:6/1393 train_time:18382ms step_avg:nanms step:7/1393 train_time:18503ms step_avg:nanms step:8/1393 train_time:18624ms step_avg:nanms step:9/1393 train_time:18746ms step_avg:nanms step:10/1393 train_time:18872ms step_avg:nanms step:11/1393 train_time:124ms step_avg:nanms step:12/1393 train_time:248ms step_avg:nanms step:13/1393 train_time:370ms step_avg:123.47ms step:14/1393 train_time:494ms step_avg:123.57ms step:15/1393 train_time:616ms step_avg:123.17ms step:16/1393 train_time:738ms step_avg:123.02ms step:17/1393 train_time:862ms step_avg:123.08ms step:18/1393 train_time:984ms step_avg:122.97ms step:19/1393 train_time:1107ms step_avg:122.95ms step:20/1393 train_time:1229ms step_avg:122.93ms step:21/1393 train_time:1351ms step_avg:122.85ms step:22/1393 train_time:1475ms step_avg:122.93ms step:23/1393 train_time:1597ms step_avg:122.86ms step:24/1393 train_time:1720ms step_avg:122.86ms step:25/1393 train_time:1843ms step_avg:122.84ms step:26/1393 train_time:1966ms step_avg:122.87ms step:27/1393 train_time:2088ms step_avg:122.82ms step:28/1393 train_time:2211ms step_avg:122.84ms step:29/1393 train_time:2335ms step_avg:122.90ms step:30/1393 train_time:2458ms step_avg:122.91ms step:31/1393 train_time:2582ms step_avg:122.96ms 
step:32/1393 train_time:2705ms step_avg:122.94ms step:33/1393 train_time:2827ms step_avg:122.90ms step:34/1393 train_time:2950ms step_avg:122.92ms step:35/1393 train_time:3074ms step_avg:122.97ms step:36/1393 train_time:3200ms step_avg:123.06ms step:37/1393 train_time:3323ms step_avg:123.07ms step:38/1393 train_time:3445ms step_avg:123.05ms step:39/1393 train_time:3569ms step_avg:123.06ms step:40/1393 train_time:3693ms step_avg:123.08ms step:41/1393 train_time:3815ms step_avg:123.05ms step:42/1393 train_time:3937ms step_avg:123.03ms step:43/1393 train_time:4061ms step_avg:123.08ms step:44/1393 train_time:4184ms step_avg:123.07ms step:45/1393 train_time:4307ms step_avg:123.05ms step:46/1393 train_time:4430ms step_avg:123.04ms step:47/1393 train_time:4552ms step_avg:123.03ms step:48/1393 train_time:4676ms step_avg:123.06ms step:49/1393 train_time:4798ms step_avg:123.03ms step:50/1393 train_time:4920ms step_avg:123.01ms step:51/1393 train_time:5043ms step_avg:123.00ms step:52/1393 train_time:5168ms step_avg:123.04ms step:53/1393 train_time:5290ms step_avg:123.03ms step:54/1393 train_time:5413ms step_avg:123.02ms step:55/1393 train_time:5538ms step_avg:123.07ms step:56/1393 train_time:5662ms step_avg:123.08ms step:57/1393 train_time:5785ms step_avg:123.08ms step:58/1393 train_time:5907ms step_avg:123.07ms step:59/1393 train_time:6031ms step_avg:123.07ms step:60/1393 train_time:6155ms step_avg:123.10ms step:61/1393 train_time:6277ms step_avg:123.09ms step:62/1393 train_time:6402ms step_avg:123.12ms step:63/1393 train_time:6526ms step_avg:123.13ms step:64/1393 train_time:6649ms step_avg:123.14ms step:65/1393 train_time:6773ms step_avg:123.14ms step:66/1393 train_time:6896ms step_avg:123.14ms step:67/1393 train_time:7019ms step_avg:123.13ms step:68/1393 train_time:7142ms step_avg:123.14ms step:69/1393 train_time:7264ms step_avg:123.13ms step:70/1393 train_time:7387ms step_avg:123.12ms step:71/1393 train_time:7509ms step_avg:123.10ms step:72/1393 train_time:7631ms 
step_avg:123.08ms step:73/1393 train_time:7755ms step_avg:123.10ms step:74/1393 train_time:7877ms step_avg:123.08ms step:75/1393 train_time:7999ms step_avg:123.06ms step:76/1393 train_time:8122ms step_avg:123.05ms step:77/1393 train_time:8244ms step_avg:123.04ms step:78/1393 train_time:8370ms step_avg:123.08ms step:79/1393 train_time:8493ms step_avg:123.09ms step:80/1393 train_time:8616ms step_avg:123.09ms step:81/1393 train_time:8739ms step_avg:123.08ms step:82/1393 train_time:8863ms step_avg:123.09ms step:83/1393 train_time:8985ms step_avg:123.09ms step:84/1393 train_time:9109ms step_avg:123.09ms step:85/1393 train_time:9232ms step_avg:123.09ms step:86/1393 train_time:9354ms step_avg:123.08ms step:87/1393 train_time:9476ms step_avg:123.07ms step:88/1393 train_time:9599ms step_avg:123.06ms step:89/1393 train_time:9724ms step_avg:123.09ms step:90/1393 train_time:9846ms step_avg:123.08ms step:91/1393 train_time:9968ms step_avg:123.06ms step:92/1393 train_time:10091ms step_avg:123.06ms step:93/1393 train_time:10215ms step_avg:123.07ms step:94/1393 train_time:10338ms step_avg:123.07ms step:95/1393 train_time:10461ms step_avg:123.07ms step:96/1393 train_time:10584ms step_avg:123.08ms step:97/1393 train_time:10708ms step_avg:123.08ms step:98/1393 train_time:10831ms step_avg:123.08ms step:99/1393 train_time:10952ms step_avg:123.06ms step:100/1393 train_time:11074ms step_avg:123.05ms step:101/1393 train_time:11196ms step_avg:123.04ms step:102/1393 train_time:11319ms step_avg:123.04ms step:103/1393 train_time:11442ms step_avg:123.04ms step:104/1393 train_time:11567ms step_avg:123.06ms step:105/1393 train_time:11690ms step_avg:123.05ms step:106/1393 train_time:11814ms step_avg:123.06ms step:107/1393 train_time:11938ms step_avg:123.07ms step:108/1393 train_time:12060ms step_avg:123.06ms step:109/1393 train_time:12184ms step_avg:123.07ms step:110/1393 train_time:12308ms step_avg:123.08ms step:111/1393 train_time:12431ms step_avg:123.07ms step:112/1393 train_time:12556ms 
step_avg:123.10ms step:113/1393 train_time:12680ms step_avg:123.11ms step:114/1393 train_time:12803ms step_avg:123.10ms step:115/1393 train_time:12927ms step_avg:123.11ms step:116/1393 train_time:13049ms step_avg:123.11ms step:117/1393 train_time:13172ms step_avg:123.11ms step:118/1393 train_time:13296ms step_avg:123.11ms step:119/1393 train_time:13419ms step_avg:123.11ms step:120/1393 train_time:13542ms step_avg:123.11ms step:121/1393 train_time:13665ms step_avg:123.11ms step:122/1393 train_time:13788ms step_avg:123.11ms step:123/1393 train_time:13912ms step_avg:123.12ms step:124/1393 train_time:14035ms step_avg:123.12ms step:125/1393 train_time:14159ms step_avg:123.12ms step:125/1393 val_loss:4.4043 train_time:14281ms step_avg:124.19ms step:126/1393 train_time:14299ms step_avg:123.27ms step:127/1393 train_time:14422ms step_avg:123.27ms step:128/1393 train_time:14550ms step_avg:123.30ms step:129/1393 train_time:14673ms step_avg:123.30ms step:130/1393 train_time:14796ms step_avg:123.30ms step:131/1393 train_time:14918ms step_avg:123.29ms step:132/1393 train_time:15040ms step_avg:123.28ms step:133/1393 train_time:15163ms step_avg:123.27ms step:134/1393 train_time:15285ms step_avg:123.27ms step:135/1393 train_time:15410ms step_avg:123.28ms step:136/1393 train_time:15534ms step_avg:123.29ms step:137/1393 train_time:15658ms step_avg:123.29ms step:138/1393 train_time:15781ms step_avg:123.29ms step:139/1393 train_time:15906ms step_avg:123.30ms step:140/1393 train_time:16029ms step_avg:123.30ms step:141/1393 train_time:16152ms step_avg:123.30ms step:142/1393 train_time:16275ms step_avg:123.30ms step:143/1393 train_time:16399ms step_avg:123.30ms step:144/1393 train_time:16524ms step_avg:123.31ms step:145/1393 train_time:16649ms step_avg:123.33ms step:146/1393 train_time:16772ms step_avg:123.32ms step:147/1393 train_time:16894ms step_avg:123.32ms step:148/1393 train_time:17017ms step_avg:123.31ms step:149/1393 train_time:17139ms step_avg:123.30ms step:150/1393 
train_time:17262ms step_avg:123.30ms step:151/1393 train_time:17386ms step_avg:123.30ms step:152/1393 train_time:17509ms step_avg:123.30ms step:153/1393 train_time:17632ms step_avg:123.30ms step:154/1393 train_time:17755ms step_avg:123.30ms step:155/1393 train_time:17879ms step_avg:123.30ms step:156/1393 train_time:18003ms step_avg:123.31ms step:157/1393 train_time:18127ms step_avg:123.31ms step:158/1393 train_time:18249ms step_avg:123.31ms step:159/1393 train_time:18372ms step_avg:123.30ms step:160/1393 train_time:18495ms step_avg:123.30ms step:161/1393 train_time:18618ms step_avg:123.30ms step:162/1393 train_time:18741ms step_avg:123.30ms step:163/1393 train_time:18866ms step_avg:123.30ms step:164/1393 train_time:18988ms step_avg:123.30ms step:165/1393 train_time:19111ms step_avg:123.30ms step:166/1393 train_time:19235ms step_avg:123.30ms step:167/1393 train_time:19359ms step_avg:123.31ms step:168/1393 train_time:19483ms step_avg:123.31ms step:169/1393 train_time:19607ms step_avg:123.31ms step:170/1393 train_time:19731ms step_avg:123.32ms step:171/1393 train_time:19855ms step_avg:123.32ms step:172/1393 train_time:19978ms step_avg:123.32ms step:173/1393 train_time:20102ms step_avg:123.32ms step:174/1393 train_time:20225ms step_avg:123.33ms step:175/1393 train_time:20348ms step_avg:123.32ms step:176/1393 train_time:20472ms step_avg:123.33ms step:177/1393 train_time:20594ms step_avg:123.32ms step:178/1393 train_time:20717ms step_avg:123.32ms step:179/1393 train_time:20842ms step_avg:123.33ms step:180/1393 train_time:20967ms step_avg:123.33ms step:181/1393 train_time:21089ms step_avg:123.33ms step:182/1393 train_time:21212ms step_avg:123.32ms step:183/1393 train_time:21335ms step_avg:123.32ms step:184/1393 train_time:21458ms step_avg:123.32ms step:185/1393 train_time:21581ms step_avg:123.32ms step:186/1393 train_time:21706ms step_avg:123.33ms step:187/1393 train_time:21829ms step_avg:123.33ms step:188/1393 train_time:21952ms step_avg:123.32ms step:189/1393 
train_time:22074ms step_avg:123.32ms step:190/1393 train_time:22198ms step_avg:123.32ms step:191/1393 train_time:22322ms step_avg:123.32ms step:192/1393 train_time:22445ms step_avg:123.32ms step:193/1393 train_time:22567ms step_avg:123.32ms step:194/1393 train_time:22691ms step_avg:123.32ms step:195/1393 train_time:22814ms step_avg:123.32ms step:196/1393 train_time:22938ms step_avg:123.32ms step:197/1393 train_time:23062ms step_avg:123.33ms step:198/1393 train_time:23187ms step_avg:123.33ms step:199/1393 train_time:23310ms step_avg:123.33ms step:200/1393 train_time:23434ms step_avg:123.34ms step:201/1393 train_time:23558ms step_avg:123.34ms step:202/1393 train_time:23681ms step_avg:123.34ms step:203/1393 train_time:23804ms step_avg:123.34ms step:204/1393 train_time:23928ms step_avg:123.34ms step:205/1393 train_time:24051ms step_avg:123.34ms step:206/1393 train_time:24175ms step_avg:123.34ms step:207/1393 train_time:24298ms step_avg:123.34ms step:208/1393 train_time:24422ms step_avg:123.34ms step:209/1393 train_time:24546ms step_avg:123.35ms step:210/1393 train_time:24669ms step_avg:123.35ms step:211/1393 train_time:24793ms step_avg:123.35ms step:212/1393 train_time:24918ms step_avg:123.35ms step:213/1393 train_time:25042ms step_avg:123.36ms step:214/1393 train_time:25167ms step_avg:123.37ms step:215/1393 train_time:25291ms step_avg:123.37ms step:216/1393 train_time:25415ms step_avg:123.37ms step:217/1393 train_time:25540ms step_avg:123.38ms step:218/1393 train_time:25665ms step_avg:123.39ms step:219/1393 train_time:25789ms step_avg:123.39ms step:220/1393 train_time:25914ms step_avg:123.40ms step:221/1393 train_time:26038ms step_avg:123.40ms step:222/1393 train_time:26162ms step_avg:123.41ms step:223/1393 train_time:26286ms step_avg:123.41ms step:224/1393 train_time:26410ms step_avg:123.41ms step:225/1393 train_time:26533ms step_avg:123.41ms step:226/1393 train_time:26656ms step_avg:123.41ms step:227/1393 train_time:26780ms step_avg:123.41ms step:228/1393 
train_time:26905ms step_avg:123.42ms step:229/1393 train_time:27029ms step_avg:123.42ms step:230/1393 train_time:27153ms step_avg:123.42ms step:231/1393 train_time:27276ms step_avg:123.42ms step:232/1393 train_time:27400ms step_avg:123.43ms step:233/1393 train_time:27523ms step_avg:123.42ms step:234/1393 train_time:27646ms step_avg:123.42ms step:235/1393 train_time:27770ms step_avg:123.42ms step:236/1393 train_time:27894ms step_avg:123.43ms step:237/1393 train_time:28018ms step_avg:123.43ms step:238/1393 train_time:28143ms step_avg:123.43ms step:239/1393 train_time:28267ms step_avg:123.44ms step:240/1393 train_time:28390ms step_avg:123.44ms step:241/1393 train_time:28514ms step_avg:123.44ms step:242/1393 train_time:28640ms step_avg:123.45ms step:243/1393 train_time:28764ms step_avg:123.45ms step:244/1393 train_time:28887ms step_avg:123.45ms step:245/1393 train_time:29011ms step_avg:123.45ms step:246/1393 train_time:29135ms step_avg:123.45ms step:247/1393 train_time:29259ms step_avg:123.46ms step:248/1393 train_time:29383ms step_avg:123.46ms step:249/1393 train_time:29508ms step_avg:123.46ms step:250/1393 train_time:29631ms step_avg:123.46ms step:250/1393 val_loss:3.9863 train_time:29754ms step_avg:123.97ms step:251/1393 train_time:29772ms step_avg:123.53ms step:252/1393 train_time:29893ms step_avg:123.53ms step:253/1393 train_time:30018ms step_avg:123.53ms step:254/1393 train_time:30141ms step_avg:123.53ms step:255/1393 train_time:30264ms step_avg:123.53ms step:256/1393 train_time:30386ms step_avg:123.52ms step:257/1393 train_time:30509ms step_avg:123.52ms step:258/1393 train_time:30632ms step_avg:123.52ms step:259/1393 train_time:30755ms step_avg:123.51ms step:260/1393 train_time:30881ms step_avg:123.53ms step:261/1393 train_time:31007ms step_avg:123.53ms step:262/1393 train_time:31132ms step_avg:123.54ms step:263/1393 train_time:31255ms step_avg:123.54ms step:264/1393 train_time:31379ms step_avg:123.54ms step:265/1393 train_time:31502ms step_avg:123.54ms 
step:266/1393 train_time:31626ms step_avg:123.54ms step:267/1393 train_time:31750ms step_avg:123.54ms step:268/1393 train_time:31873ms step_avg:123.54ms step:269/1393 train_time:31997ms step_avg:123.54ms step:270/1393 train_time:32121ms step_avg:123.54ms step:271/1393 train_time:32245ms step_avg:123.55ms step:272/1393 train_time:32370ms step_avg:123.55ms step:273/1393 train_time:32493ms step_avg:123.55ms step:274/1393 train_time:32616ms step_avg:123.55ms step:275/1393 train_time:32742ms step_avg:123.55ms step:276/1393 train_time:32866ms step_avg:123.56ms step:277/1393 train_time:32989ms step_avg:123.56ms step:278/1393 train_time:33113ms step_avg:123.56ms step:279/1393 train_time:33236ms step_avg:123.56ms step:280/1393 train_time:33360ms step_avg:123.56ms step:281/1393 train_time:33484ms step_avg:123.56ms step:282/1393 train_time:33607ms step_avg:123.55ms step:283/1393 train_time:33731ms step_avg:123.56ms step:284/1393 train_time:33856ms step_avg:123.56ms step:285/1393 train_time:33981ms step_avg:123.57ms step:286/1393 train_time:34105ms step_avg:123.57ms step:287/1393 train_time:34228ms step_avg:123.57ms step:288/1393 train_time:34350ms step_avg:123.56ms step:289/1393 train_time:34475ms step_avg:123.57ms step:290/1393 train_time:34599ms step_avg:123.57ms step:291/1393 train_time:34722ms step_avg:123.57ms step:292/1393 train_time:34847ms step_avg:123.57ms step:293/1393 train_time:34971ms step_avg:123.57ms step:294/1393 train_time:35096ms step_avg:123.58ms step:295/1393 train_time:35220ms step_avg:123.58ms step:296/1393 train_time:35343ms step_avg:123.58ms step:297/1393 train_time:35467ms step_avg:123.58ms step:298/1393 train_time:35590ms step_avg:123.58ms step:299/1393 train_time:35714ms step_avg:123.58ms step:300/1393 train_time:35837ms step_avg:123.58ms step:301/1393 train_time:35960ms step_avg:123.57ms step:302/1393 train_time:36086ms step_avg:123.58ms step:303/1393 train_time:36210ms step_avg:123.58ms step:304/1393 train_time:36333ms step_avg:123.58ms 
step:305/1393 train_time:36458ms step_avg:123.59ms step:306/1393 train_time:36581ms step_avg:123.58ms step:307/1393 train_time:36705ms step_avg:123.59ms step:308/1393 train_time:36829ms step_avg:123.59ms step:309/1393 train_time:36953ms step_avg:123.59ms step:310/1393 train_time:37077ms step_avg:123.59ms step:311/1393 train_time:37202ms step_avg:123.59ms step:312/1393 train_time:37328ms step_avg:123.60ms step:313/1393 train_time:37453ms step_avg:123.61ms step:314/1393 train_time:37580ms step_avg:123.62ms step:315/1393 train_time:37706ms step_avg:123.63ms step:316/1393 train_time:37832ms step_avg:123.63ms step:317/1393 train_time:37958ms step_avg:123.64ms step:318/1393 train_time:38085ms step_avg:123.65ms step:319/1393 train_time:38212ms step_avg:123.66ms step:320/1393 train_time:38340ms step_avg:123.68ms step:321/1393 train_time:38467ms step_avg:123.69ms step:322/1393 train_time:38593ms step_avg:123.70ms step:323/1393 train_time:38719ms step_avg:123.70ms step:324/1393 train_time:38845ms step_avg:123.71ms step:325/1393 train_time:38971ms step_avg:123.72ms step:326/1393 train_time:39097ms step_avg:123.72ms step:327/1393 train_time:39224ms step_avg:123.73ms step:328/1393 train_time:39350ms step_avg:123.74ms step:329/1393 train_time:39477ms step_avg:123.75ms step:330/1393 train_time:39603ms step_avg:123.76ms step:331/1393 train_time:39729ms step_avg:123.77ms step:332/1393 train_time:39855ms step_avg:123.77ms step:333/1393 train_time:39981ms step_avg:123.78ms step:334/1393 train_time:40107ms step_avg:123.79ms step:335/1393 train_time:40234ms step_avg:123.80ms step:336/1393 train_time:40360ms step_avg:123.80ms step:337/1393 train_time:40486ms step_avg:123.81ms step:338/1393 train_time:40612ms step_avg:123.82ms step:339/1393 train_time:40740ms step_avg:123.83ms step:340/1393 train_time:40865ms step_avg:123.83ms step:341/1393 train_time:40993ms step_avg:123.85ms step:342/1393 train_time:41118ms step_avg:123.85ms step:343/1393 train_time:41245ms step_avg:123.86ms 
step:344/1393 train_time:41373ms step_avg:123.87ms step:345/1393 train_time:41499ms step_avg:123.88ms step:346/1393 train_time:41625ms step_avg:123.88ms step:347/1393 train_time:41752ms step_avg:123.89ms step:348/1393 train_time:41879ms step_avg:123.90ms step:349/1393 train_time:42005ms step_avg:123.91ms step:350/1393 train_time:42131ms step_avg:123.91ms step:351/1393 train_time:42258ms step_avg:123.92ms step:352/1393 train_time:42384ms step_avg:123.93ms step:353/1393 train_time:42511ms step_avg:123.94ms step:354/1393 train_time:42638ms step_avg:123.95ms step:355/1393 train_time:42765ms step_avg:123.96ms step:356/1393 train_time:42891ms step_avg:123.96ms step:357/1393 train_time:43018ms step_avg:123.97ms step:358/1393 train_time:43144ms step_avg:123.98ms step:359/1393 train_time:43271ms step_avg:123.99ms step:360/1393 train_time:43397ms step_avg:123.99ms step:361/1393 train_time:43523ms step_avg:124.00ms step:362/1393 train_time:43650ms step_avg:124.00ms step:363/1393 train_time:43777ms step_avg:124.01ms step:364/1393 train_time:43904ms step_avg:124.02ms step:365/1393 train_time:44030ms step_avg:124.03ms step:366/1393 train_time:44156ms step_avg:124.03ms step:367/1393 train_time:44283ms step_avg:124.04ms step:368/1393 train_time:44409ms step_avg:124.05ms step:369/1393 train_time:44535ms step_avg:124.05ms step:370/1393 train_time:44663ms step_avg:124.06ms step:371/1393 train_time:44788ms step_avg:124.07ms step:372/1393 train_time:44915ms step_avg:124.07ms step:373/1393 train_time:45042ms step_avg:124.08ms step:374/1393 train_time:45168ms step_avg:124.09ms step:375/1393 train_time:45294ms step_avg:124.09ms step:375/1393 val_loss:3.7856 train_time:45419ms step_avg:124.44ms step:376/1393 train_time:45437ms step_avg:124.15ms step:377/1393 train_time:45561ms step_avg:124.14ms step:378/1393 train_time:45689ms step_avg:124.16ms step:379/1393 train_time:45814ms step_avg:124.16ms step:380/1393 train_time:45940ms step_avg:124.16ms step:381/1393 train_time:46067ms 
step_avg:124.17ms step:382/1393 train_time:46192ms step_avg:124.17ms step:383/1393 train_time:46318ms step_avg:124.18ms step:384/1393 train_time:46444ms step_avg:124.18ms step:385/1393 train_time:46571ms step_avg:124.19ms step:386/1393 train_time:46697ms step_avg:124.20ms step:387/1393 train_time:46824ms step_avg:124.20ms step:388/1393 train_time:46950ms step_avg:124.21ms step:389/1393 train_time:47076ms step_avg:124.21ms step:390/1393 train_time:47202ms step_avg:124.22ms step:391/1393 train_time:47329ms step_avg:124.22ms step:392/1393 train_time:47456ms step_avg:124.23ms step:393/1393 train_time:47581ms step_avg:124.23ms step:394/1393 train_time:47707ms step_avg:124.24ms step:395/1393 train_time:47833ms step_avg:124.24ms step:396/1393 train_time:47959ms step_avg:124.25ms step:397/1393 train_time:48085ms step_avg:124.25ms step:398/1393 train_time:48211ms step_avg:124.25ms step:399/1393 train_time:48338ms step_avg:124.26ms step:400/1393 train_time:48465ms step_avg:124.27ms step:401/1393 train_time:48592ms step_avg:124.28ms step:402/1393 train_time:48718ms step_avg:124.28ms step:403/1393 train_time:48844ms step_avg:124.28ms step:404/1393 train_time:48970ms step_avg:124.29ms step:405/1393 train_time:49095ms step_avg:124.29ms step:406/1393 train_time:49221ms step_avg:124.29ms step:407/1393 train_time:49347ms step_avg:124.30ms step:408/1393 train_time:49473ms step_avg:124.30ms step:409/1393 train_time:49599ms step_avg:124.31ms step:410/1393 train_time:49727ms step_avg:124.32ms step:411/1393 train_time:49853ms step_avg:124.32ms step:412/1393 train_time:49980ms step_avg:124.33ms step:413/1393 train_time:50106ms step_avg:124.33ms step:414/1393 train_time:50232ms step_avg:124.34ms step:415/1393 train_time:50359ms step_avg:124.34ms step:416/1393 train_time:50485ms step_avg:124.35ms step:417/1393 train_time:50612ms step_avg:124.35ms step:418/1393 train_time:50738ms step_avg:124.36ms step:419/1393 train_time:50863ms step_avg:124.36ms step:420/1393 train_time:50990ms 
step_avg:124.37ms step:421/1393 train_time:51117ms step_avg:124.37ms step:422/1393 train_time:51245ms step_avg:124.38ms step:423/1393 train_time:51371ms step_avg:124.38ms step:424/1393 train_time:51498ms step_avg:124.39ms step:425/1393 train_time:51625ms step_avg:124.40ms step:426/1393 train_time:51752ms step_avg:124.40ms step:427/1393 train_time:51878ms step_avg:124.41ms step:428/1393 train_time:52005ms step_avg:124.41ms step:429/1393 train_time:52132ms step_avg:124.42ms step:430/1393 train_time:52259ms step_avg:124.43ms step:431/1393 train_time:52386ms step_avg:124.43ms step:432/1393 train_time:52513ms step_avg:124.44ms step:433/1393 train_time:52640ms step_avg:124.44ms step:434/1393 train_time:52766ms step_avg:124.45ms step:435/1393 train_time:52893ms step_avg:124.45ms step:436/1393 train_time:53019ms step_avg:124.46ms step:437/1393 train_time:53145ms step_avg:124.46ms step:438/1393 train_time:53271ms step_avg:124.47ms step:439/1393 train_time:53398ms step_avg:124.47ms step:440/1393 train_time:53525ms step_avg:124.48ms step:441/1393 train_time:53651ms step_avg:124.48ms step:442/1393 train_time:53778ms step_avg:124.49ms step:443/1393 train_time:53905ms step_avg:124.49ms step:444/1393 train_time:54031ms step_avg:124.50ms step:445/1393 train_time:54158ms step_avg:124.50ms step:446/1393 train_time:54284ms step_avg:124.51ms step:447/1393 train_time:54411ms step_avg:124.51ms step:448/1393 train_time:54538ms step_avg:124.52ms step:449/1393 train_time:54666ms step_avg:124.52ms step:450/1393 train_time:54793ms step_avg:124.53ms step:451/1393 train_time:54919ms step_avg:124.53ms step:452/1393 train_time:55046ms step_avg:124.54ms step:453/1393 train_time:55172ms step_avg:124.54ms step:454/1393 train_time:55300ms step_avg:124.55ms step:455/1393 train_time:55427ms step_avg:124.55ms step:456/1393 train_time:55553ms step_avg:124.56ms step:457/1393 train_time:55679ms step_avg:124.56ms step:458/1393 train_time:55808ms step_avg:124.57ms step:459/1393 train_time:55934ms 
step_avg:124.57ms step:460/1393 train_time:56060ms step_avg:124.58ms step:461/1393 train_time:56186ms step_avg:124.58ms step:462/1393 train_time:56313ms step_avg:124.59ms step:463/1393 train_time:56439ms step_avg:124.59ms step:464/1393 train_time:56565ms step_avg:124.59ms step:465/1393 train_time:56691ms step_avg:124.60ms step:466/1393 train_time:56818ms step_avg:124.60ms step:467/1393 train_time:56945ms step_avg:124.61ms step:468/1393 train_time:57073ms step_avg:124.61ms step:469/1393 train_time:57200ms step_avg:124.62ms step:470/1393 train_time:57327ms step_avg:124.62ms step:471/1393 train_time:57454ms step_avg:124.63ms step:472/1393 train_time:57581ms step_avg:124.63ms step:473/1393 train_time:57708ms step_avg:124.64ms step:474/1393 train_time:57834ms step_avg:124.64ms step:475/1393 train_time:57961ms step_avg:124.65ms step:476/1393 train_time:58087ms step_avg:124.65ms step:477/1393 train_time:58214ms step_avg:124.66ms step:478/1393 train_time:58340ms step_avg:124.66ms step:479/1393 train_time:58467ms step_avg:124.66ms step:480/1393 train_time:58594ms step_avg:124.67ms step:481/1393 train_time:58721ms step_avg:124.67ms step:482/1393 train_time:58848ms step_avg:124.68ms step:483/1393 train_time:58973ms step_avg:124.68ms step:484/1393 train_time:59100ms step_avg:124.68ms step:485/1393 train_time:59227ms step_avg:124.69ms step:486/1393 train_time:59353ms step_avg:124.69ms step:487/1393 train_time:59479ms step_avg:124.69ms step:488/1393 train_time:59607ms step_avg:124.70ms step:489/1393 train_time:59732ms step_avg:124.70ms step:490/1393 train_time:59859ms step_avg:124.71ms step:491/1393 train_time:59985ms step_avg:124.71ms step:492/1393 train_time:60111ms step_avg:124.71ms step:493/1393 train_time:60238ms step_avg:124.72ms step:494/1393 train_time:60365ms step_avg:124.72ms step:495/1393 train_time:60492ms step_avg:124.72ms step:496/1393 train_time:60617ms step_avg:124.73ms step:497/1393 train_time:60744ms step_avg:124.73ms step:498/1393 train_time:60871ms 
step_avg:124.74ms step:499/1393 train_time:60998ms step_avg:124.74ms step:500/1393 train_time:61124ms step_avg:124.74ms step:500/1393 val_loss:3.6643 train_time:61249ms step_avg:125.00ms step:501/1393 train_time:61267ms step_avg:124.78ms step:502/1393 train_time:61390ms step_avg:124.78ms step:503/1393 train_time:61519ms step_avg:124.79ms step:504/1393 train_time:61647ms step_avg:124.79ms step:505/1393 train_time:61772ms step_avg:124.79ms step:506/1393 train_time:61898ms step_avg:124.79ms step:507/1393 train_time:62023ms step_avg:124.80ms step:508/1393 train_time:62150ms step_avg:124.80ms step:509/1393 train_time:62277ms step_avg:124.80ms step:510/1393 train_time:62405ms step_avg:124.81ms step:511/1393 train_time:62532ms step_avg:124.81ms step:512/1393 train_time:62659ms step_avg:124.82ms step:513/1393 train_time:62786ms step_avg:124.82ms step:514/1393 train_time:62913ms step_avg:124.83ms step:515/1393 train_time:63039ms step_avg:124.83ms step:516/1393 train_time:63166ms step_avg:124.83ms step:517/1393 train_time:63293ms step_avg:124.84ms step:518/1393 train_time:63420ms step_avg:124.84ms step:519/1393 train_time:63548ms step_avg:124.85ms step:520/1393 train_time:63677ms step_avg:124.86ms step:521/1393 train_time:63806ms step_avg:124.87ms step:522/1393 train_time:63935ms step_avg:124.87ms step:523/1393 train_time:64064ms step_avg:124.88ms step:524/1393 train_time:64194ms step_avg:124.89ms step:525/1393 train_time:64322ms step_avg:124.90ms step:526/1393 train_time:64452ms step_avg:124.91ms step:527/1393 train_time:64580ms step_avg:124.91ms step:528/1393 train_time:64709ms step_avg:124.92ms step:529/1393 train_time:64838ms step_avg:124.93ms step:530/1393 train_time:64967ms step_avg:124.94ms step:531/1393 train_time:65095ms step_avg:124.94ms step:532/1393 train_time:65223ms step_avg:124.95ms step:533/1393 train_time:65352ms step_avg:124.96ms step:534/1393 train_time:65480ms step_avg:124.96ms step:535/1393 train_time:65611ms step_avg:124.97ms step:536/1393 
train_time:65739ms step_avg:124.98ms step:537/1393 train_time:65868ms step_avg:124.99ms step:538/1393 train_time:65998ms step_avg:125.00ms step:539/1393 train_time:66127ms step_avg:125.00ms step:540/1393 train_time:66256ms step_avg:125.01ms step:541/1393 train_time:66384ms step_avg:125.02ms step:542/1393 train_time:66514ms step_avg:125.03ms step:543/1393 train_time:66643ms step_avg:125.03ms step:544/1393 train_time:66771ms step_avg:125.04ms step:545/1393 train_time:66900ms step_avg:125.05ms step:546/1393 train_time:67028ms step_avg:125.05ms step:547/1393 train_time:67157ms step_avg:125.06ms step:548/1393 train_time:67285ms step_avg:125.07ms step:549/1393 train_time:67414ms step_avg:125.07ms step:550/1393 train_time:67543ms step_avg:125.08ms step:551/1393 train_time:67672ms step_avg:125.09ms step:552/1393 train_time:67801ms step_avg:125.09ms step:553/1393 train_time:67931ms step_avg:125.10ms step:554/1393 train_time:68060ms step_avg:125.11ms step:555/1393 train_time:68189ms step_avg:125.12ms step:556/1393 train_time:68318ms step_avg:125.12ms step:557/1393 train_time:68446ms step_avg:125.13ms step:558/1393 train_time:68575ms step_avg:125.14ms step:559/1393 train_time:68704ms step_avg:125.14ms step:560/1393 train_time:68832ms step_avg:125.15ms step:561/1393 train_time:68961ms step_avg:125.16ms step:562/1393 train_time:69091ms step_avg:125.16ms step:563/1393 train_time:69219ms step_avg:125.17ms step:564/1393 train_time:69348ms step_avg:125.18ms step:565/1393 train_time:69477ms step_avg:125.18ms step:566/1393 train_time:69606ms step_avg:125.19ms step:567/1393 train_time:69736ms step_avg:125.20ms step:568/1393 train_time:69864ms step_avg:125.20ms step:569/1393 train_time:69996ms step_avg:125.22ms step:570/1393 train_time:70124ms step_avg:125.22ms step:571/1393 train_time:70252ms step_avg:125.23ms step:572/1393 train_time:70380ms step_avg:125.23ms step:573/1393 train_time:70508ms step_avg:125.24ms step:574/1393 train_time:70638ms step_avg:125.24ms step:575/1393 
train_time:70767ms step_avg:125.25ms step:576/1393 train_time:70896ms step_avg:125.26ms step:577/1393 train_time:71025ms step_avg:125.26ms step:578/1393 train_time:71154ms step_avg:125.27ms step:579/1393 train_time:71281ms step_avg:125.27ms step:580/1393 train_time:71411ms step_avg:125.28ms step:581/1393 train_time:71540ms step_avg:125.29ms step:582/1393 train_time:71668ms step_avg:125.29ms step:583/1393 train_time:71798ms step_avg:125.30ms step:584/1393 train_time:71927ms step_avg:125.31ms step:585/1393 train_time:72056ms step_avg:125.32ms step:586/1393 train_time:72184ms step_avg:125.32ms step:587/1393 train_time:72313ms step_avg:125.33ms step:588/1393 train_time:72441ms step_avg:125.33ms step:589/1393 train_time:72571ms step_avg:125.34ms step:590/1393 train_time:72699ms step_avg:125.34ms step:591/1393 train_time:72827ms step_avg:125.35ms step:592/1393 train_time:72957ms step_avg:125.36ms step:593/1393 train_time:73087ms step_avg:125.36ms step:594/1393 train_time:73216ms step_avg:125.37ms step:595/1393 train_time:73344ms step_avg:125.38ms step:596/1393 train_time:73474ms step_avg:125.38ms step:597/1393 train_time:73603ms step_avg:125.39ms step:598/1393 train_time:73731ms step_avg:125.39ms step:599/1393 train_time:73860ms step_avg:125.40ms step:600/1393 train_time:73989ms step_avg:125.41ms step:601/1393 train_time:74119ms step_avg:125.41ms step:602/1393 train_time:74246ms step_avg:125.42ms step:603/1393 train_time:74376ms step_avg:125.42ms step:604/1393 train_time:74504ms step_avg:125.43ms step:605/1393 train_time:74633ms step_avg:125.43ms step:606/1393 train_time:74762ms step_avg:125.44ms step:607/1393 train_time:74891ms step_avg:125.45ms step:608/1393 train_time:75020ms step_avg:125.45ms step:609/1393 train_time:75149ms step_avg:125.46ms step:610/1393 train_time:75278ms step_avg:125.46ms step:611/1393 train_time:75407ms step_avg:125.47ms step:612/1393 train_time:75535ms step_avg:125.47ms step:613/1393 train_time:75664ms step_avg:125.48ms step:614/1393 
train_time:75793ms step_avg:125.49ms step:615/1393 train_time:75921ms step_avg:125.49ms step:616/1393 train_time:76051ms step_avg:125.50ms step:617/1393 train_time:76179ms step_avg:125.50ms step:618/1393 train_time:76308ms step_avg:125.51ms step:619/1393 train_time:76438ms step_avg:125.51ms step:620/1393 train_time:76566ms step_avg:125.52ms step:621/1393 train_time:76695ms step_avg:125.52ms step:622/1393 train_time:76824ms step_avg:125.53ms step:623/1393 train_time:76953ms step_avg:125.53ms step:624/1393 train_time:77081ms step_avg:125.54ms step:625/1393 train_time:77211ms step_avg:125.55ms step:625/1393 val_loss:3.5831 train_time:77339ms step_avg:125.75ms step:626/1393 train_time:77356ms step_avg:125.58ms step:627/1393 train_time:77486ms step_avg:125.58ms step:628/1393 train_time:77616ms step_avg:125.59ms step:629/1393 train_time:77745ms step_avg:125.60ms step:630/1393 train_time:77873ms step_avg:125.60ms step:631/1393 train_time:78002ms step_avg:125.61ms step:632/1393 train_time:78130ms step_avg:125.61ms step:633/1393 train_time:78259ms step_avg:125.62ms step:634/1393 train_time:78388ms step_avg:125.62ms step:635/1393 train_time:78520ms step_avg:125.63ms step:636/1393 train_time:78649ms step_avg:125.64ms step:637/1393 train_time:78779ms step_avg:125.64ms step:638/1393 train_time:78908ms step_avg:125.65ms step:639/1393 train_time:79037ms step_avg:125.65ms step:640/1393 train_time:79166ms step_avg:125.66ms step:641/1393 train_time:79294ms step_avg:125.66ms step:642/1393 train_time:79424ms step_avg:125.67ms step:643/1393 train_time:79554ms step_avg:125.68ms step:644/1393 train_time:79683ms step_avg:125.68ms step:645/1393 train_time:79813ms step_avg:125.69ms step:646/1393 train_time:79944ms step_avg:125.70ms step:647/1393 train_time:80072ms step_avg:125.70ms step:648/1393 train_time:80201ms step_avg:125.71ms step:649/1393 train_time:80330ms step_avg:125.71ms step:650/1393 train_time:80460ms step_avg:125.72ms step:651/1393 train_time:80589ms step_avg:125.72ms 
step:652/1393 train_time:80718ms step_avg:125.73ms step:653/1393 train_time:80847ms step_avg:125.73ms step:654/1393 train_time:80976ms step_avg:125.74ms step:655/1393 train_time:81106ms step_avg:125.75ms step:656/1393 train_time:81235ms step_avg:125.75ms step:657/1393 train_time:81364ms step_avg:125.76ms step:658/1393 train_time:81493ms step_avg:125.76ms step:659/1393 train_time:81622ms step_avg:125.77ms step:660/1393 train_time:81750ms step_avg:125.77ms step:661/1393 train_time:81880ms step_avg:125.78ms step:662/1393 train_time:82010ms step_avg:125.78ms step:663/1393 train_time:82140ms step_avg:125.79ms step:664/1393 train_time:82269ms step_avg:125.79ms step:665/1393 train_time:82397ms step_avg:125.80ms step:666/1393 train_time:82526ms step_avg:125.80ms step:667/1393 train_time:82656ms step_avg:125.81ms step:668/1393 train_time:82785ms step_avg:125.81ms step:669/1393 train_time:82915ms step_avg:125.82ms step:670/1393 train_time:83044ms step_avg:125.82ms step:671/1393 train_time:83173ms step_avg:125.83ms step:672/1393 train_time:83304ms step_avg:125.84ms step:673/1393 train_time:83434ms step_avg:125.84ms step:674/1393 train_time:83564ms step_avg:125.85ms step:675/1393 train_time:83692ms step_avg:125.85ms step:676/1393 train_time:83821ms step_avg:125.86ms step:677/1393 train_time:83951ms step_avg:125.86ms step:678/1393 train_time:84080ms step_avg:125.87ms step:679/1393 train_time:84210ms step_avg:125.87ms step:680/1393 train_time:84341ms step_avg:125.88ms step:681/1393 train_time:84470ms step_avg:125.89ms step:682/1393 train_time:84600ms step_avg:125.89ms step:683/1393 train_time:84728ms step_avg:125.90ms step:684/1393 train_time:84858ms step_avg:125.90ms step:685/1393 train_time:84987ms step_avg:125.91ms step:686/1393 train_time:85116ms step_avg:125.91ms step:687/1393 train_time:85246ms step_avg:125.92ms step:688/1393 train_time:85374ms step_avg:125.92ms step:689/1393 train_time:85504ms step_avg:125.93ms step:690/1393 train_time:85634ms step_avg:125.93ms 
step:691/1393 train_time:85763ms step_avg:125.94ms step:692/1393 train_time:85892ms step_avg:125.94ms step:693/1393 train_time:86021ms step_avg:125.95ms step:694/1393 train_time:86150ms step_avg:125.95ms step:695/1393 train_time:86279ms step_avg:125.95ms step:696/1393 train_time:86409ms step_avg:125.96ms step:697/1393 train_time:86537ms step_avg:125.96ms step:698/1393 train_time:86667ms step_avg:125.97ms step:699/1393 train_time:86795ms step_avg:125.97ms step:700/1393 train_time:86926ms step_avg:125.98ms step:701/1393 train_time:87054ms step_avg:125.98ms step:702/1393 train_time:87183ms step_avg:125.99ms step:703/1393 train_time:87311ms step_avg:125.99ms step:704/1393 train_time:87441ms step_avg:126.00ms step:705/1393 train_time:87571ms step_avg:126.00ms step:706/1393 train_time:87701ms step_avg:126.01ms step:707/1393 train_time:87829ms step_avg:126.01ms step:708/1393 train_time:87960ms step_avg:126.02ms step:709/1393 train_time:88089ms step_avg:126.02ms step:710/1393 train_time:88218ms step_avg:126.03ms step:711/1393 train_time:88348ms step_avg:126.03ms step:712/1393 train_time:88477ms step_avg:126.04ms step:713/1393 train_time:88606ms step_avg:126.04ms step:714/1393 train_time:88735ms step_avg:126.04ms step:715/1393 train_time:88864ms step_avg:126.05ms step:716/1393 train_time:88993ms step_avg:126.05ms step:717/1393 train_time:89124ms step_avg:126.06ms step:718/1393 train_time:89254ms step_avg:126.06ms step:719/1393 train_time:89383ms step_avg:126.07ms step:720/1393 train_time:89512ms step_avg:126.07ms step:721/1393 train_time:89642ms step_avg:126.08ms step:722/1393 train_time:89771ms step_avg:126.08ms step:723/1393 train_time:89899ms step_avg:126.09ms step:724/1393 train_time:90029ms step_avg:126.09ms step:725/1393 train_time:90161ms step_avg:126.10ms step:726/1393 train_time:90293ms step_avg:126.11ms step:727/1393 train_time:90425ms step_avg:126.12ms step:728/1393 train_time:90556ms step_avg:126.12ms step:729/1393 train_time:90687ms step_avg:126.13ms 
step:730/1393 train_time:90818ms step_avg:126.14ms step:731/1393 train_time:90949ms step_avg:126.14ms step:732/1393 train_time:91079ms step_avg:126.15ms step:733/1393 train_time:91210ms step_avg:126.16ms step:734/1393 train_time:91341ms step_avg:126.16ms step:735/1393 train_time:91472ms step_avg:126.17ms step:736/1393 train_time:91603ms step_avg:126.17ms step:737/1393 train_time:91734ms step_avg:126.18ms step:738/1393 train_time:91864ms step_avg:126.19ms step:739/1393 train_time:91995ms step_avg:126.19ms step:740/1393 train_time:92126ms step_avg:126.20ms step:741/1393 train_time:92257ms step_avg:126.21ms step:742/1393 train_time:92389ms step_avg:126.21ms step:743/1393 train_time:92521ms step_avg:126.22ms step:744/1393 train_time:92653ms step_avg:126.23ms step:745/1393 train_time:92784ms step_avg:126.24ms step:746/1393 train_time:92915ms step_avg:126.24ms step:747/1393 train_time:93046ms step_avg:126.25ms step:748/1393 train_time:93178ms step_avg:126.26ms step:749/1393 train_time:93310ms step_avg:126.27ms step:750/1393 train_time:93442ms step_avg:126.27ms step:750/1393 val_loss:3.5286 train_time:93571ms step_avg:126.45ms step:751/1393 train_time:93589ms step_avg:126.30ms step:752/1393 train_time:93714ms step_avg:126.30ms step:753/1393 train_time:93847ms step_avg:126.31ms step:754/1393 train_time:93978ms step_avg:126.31ms step:755/1393 train_time:94108ms step_avg:126.32ms step:756/1393 train_time:94239ms step_avg:126.33ms step:757/1393 train_time:94371ms step_avg:126.33ms step:758/1393 train_time:94502ms step_avg:126.34ms step:759/1393 train_time:94634ms step_avg:126.35ms step:760/1393 train_time:94767ms step_avg:126.36ms step:761/1393 train_time:94897ms step_avg:126.36ms step:762/1393 train_time:95028ms step_avg:126.37ms step:763/1393 train_time:95158ms step_avg:126.37ms step:764/1393 train_time:95290ms step_avg:126.38ms step:765/1393 train_time:95420ms step_avg:126.38ms step:766/1393 train_time:95552ms step_avg:126.39ms step:767/1393 train_time:95683ms 
step_avg:126.40ms step:768/1393 train_time:95814ms step_avg:126.40ms step:769/1393 train_time:95945ms step_avg:126.41ms step:770/1393 train_time:96075ms step_avg:126.41ms step:771/1393 train_time:96206ms step_avg:126.42ms step:772/1393 train_time:96336ms step_avg:126.43ms step:773/1393 train_time:96468ms step_avg:126.43ms step:774/1393 train_time:96598ms step_avg:126.44ms step:775/1393 train_time:96730ms step_avg:126.44ms step:776/1393 train_time:96860ms step_avg:126.45ms step:777/1393 train_time:96992ms step_avg:126.46ms step:778/1393 train_time:97121ms step_avg:126.46ms step:779/1393 train_time:97253ms step_avg:126.47ms step:780/1393 train_time:97385ms step_avg:126.47ms step:781/1393 train_time:97517ms step_avg:126.48ms step:782/1393 train_time:97648ms step_avg:126.49ms step:783/1393 train_time:97781ms step_avg:126.50ms step:784/1393 train_time:97910ms step_avg:126.50ms step:785/1393 train_time:98041ms step_avg:126.50ms step:786/1393 train_time:98172ms step_avg:126.51ms step:787/1393 train_time:98303ms step_avg:126.52ms step:788/1393 train_time:98434ms step_avg:126.52ms step:789/1393 train_time:98565ms step_avg:126.53ms step:790/1393 train_time:98696ms step_avg:126.53ms step:791/1393 train_time:98826ms step_avg:126.54ms step:792/1393 train_time:98959ms step_avg:126.55ms step:793/1393 train_time:99090ms step_avg:126.55ms step:794/1393 train_time:99220ms step_avg:126.56ms step:795/1393 train_time:99353ms step_avg:126.56ms step:796/1393 train_time:99484ms step_avg:126.57ms step:797/1393 train_time:99615ms step_avg:126.58ms step:798/1393 train_time:99747ms step_avg:126.58ms step:799/1393 train_time:99878ms step_avg:126.59ms step:800/1393 train_time:100008ms step_avg:126.59ms step:801/1393 train_time:100139ms step_avg:126.60ms step:802/1393 train_time:100270ms step_avg:126.60ms step:803/1393 train_time:100401ms step_avg:126.61ms step:804/1393 train_time:100531ms step_avg:126.61ms step:805/1393 train_time:100663ms step_avg:126.62ms step:806/1393 train_time:100794ms 
step_avg:126.63ms step:807/1393 train_time:100925ms step_avg:126.63ms step:808/1393 train_time:101056ms step_avg:126.64ms step:809/1393 train_time:101187ms step_avg:126.64ms step:810/1393 train_time:101319ms step_avg:126.65ms step:811/1393 train_time:101450ms step_avg:126.65ms step:812/1393 train_time:101583ms step_avg:126.66ms step:813/1393 train_time:101713ms step_avg:126.67ms step:814/1393 train_time:101844ms step_avg:126.67ms step:815/1393 train_time:101975ms step_avg:126.68ms step:816/1393 train_time:102107ms step_avg:126.68ms step:817/1393 train_time:102238ms step_avg:126.69ms step:818/1393 train_time:102369ms step_avg:126.69ms step:819/1393 train_time:102501ms step_avg:126.70ms step:820/1393 train_time:102632ms step_avg:126.71ms step:821/1393 train_time:102763ms step_avg:126.71ms step:822/1393 train_time:102895ms step_avg:126.72ms step:823/1393 train_time:103025ms step_avg:126.72ms step:824/1393 train_time:103156ms step_avg:126.73ms step:825/1393 train_time:103287ms step_avg:126.73ms step:826/1393 train_time:103420ms step_avg:126.74ms step:827/1393 train_time:103551ms step_avg:126.75ms step:828/1393 train_time:103682ms step_avg:126.75ms step:829/1393 train_time:103813ms step_avg:126.76ms step:830/1393 train_time:103944ms step_avg:126.76ms step:831/1393 train_time:104076ms step_avg:126.77ms step:832/1393 train_time:104207ms step_avg:126.77ms step:833/1393 train_time:104338ms step_avg:126.78ms step:834/1393 train_time:104471ms step_avg:126.79ms step:835/1393 train_time:104602ms step_avg:126.79ms step:836/1393 train_time:104735ms step_avg:126.80ms step:837/1393 train_time:104866ms step_avg:126.80ms step:838/1393 train_time:104997ms step_avg:126.81ms step:839/1393 train_time:105127ms step_avg:126.81ms step:840/1393 train_time:105258ms step_avg:126.82ms step:841/1393 train_time:105389ms step_avg:126.82ms step:842/1393 train_time:105520ms step_avg:126.83ms step:843/1393 train_time:105652ms step_avg:126.83ms step:844/1393 train_time:105783ms step_avg:126.84ms 
step:845/1393 train_time:105914ms step_avg:126.84ms step:846/1393 train_time:106045ms step_avg:126.85ms step:847/1393 train_time:106176ms step_avg:126.85ms step:848/1393 train_time:106307ms step_avg:126.86ms step:849/1393 train_time:106440ms step_avg:126.87ms step:850/1393 train_time:106571ms step_avg:126.87ms step:851/1393 train_time:106704ms step_avg:126.88ms step:852/1393 train_time:106835ms step_avg:126.88ms step:853/1393 train_time:106966ms step_avg:126.89ms step:854/1393 train_time:107097ms step_avg:126.89ms step:855/1393 train_time:107228ms step_avg:126.90ms step:856/1393 train_time:107359ms step_avg:126.90ms step:857/1393 train_time:107490ms step_avg:126.91ms step:858/1393 train_time:107622ms step_avg:126.91ms step:859/1393 train_time:107754ms step_avg:126.92ms step:860/1393 train_time:107884ms step_avg:126.92ms step:861/1393 train_time:108017ms step_avg:126.93ms step:862/1393 train_time:108148ms step_avg:126.93ms step:863/1393 train_time:108280ms step_avg:126.94ms step:864/1393 train_time:108410ms step_avg:126.94ms step:865/1393 train_time:108543ms step_avg:126.95ms step:866/1393 train_time:108676ms step_avg:126.96ms step:867/1393 train_time:108806ms step_avg:126.96ms step:868/1393 train_time:108938ms step_avg:126.97ms step:869/1393 train_time:109069ms step_avg:126.97ms step:870/1393 train_time:109201ms step_avg:126.98ms step:871/1393 train_time:109334ms step_avg:126.98ms step:872/1393 train_time:109466ms step_avg:126.99ms step:873/1393 train_time:109598ms step_avg:127.00ms step:874/1393 train_time:109730ms step_avg:127.00ms step:875/1393 train_time:109861ms step_avg:127.01ms step:875/1393 val_loss:3.4772 train_time:109992ms step_avg:127.16ms step:876/1393 train_time:110010ms step_avg:127.03ms step:877/1393 train_time:110136ms step_avg:127.03ms step:878/1393 train_time:110268ms step_avg:127.04ms step:879/1393 train_time:110399ms step_avg:127.04ms step:880/1393 train_time:110530ms step_avg:127.05ms step:881/1393 train_time:110661ms step_avg:127.05ms 
step:882/1393 train_time:110792ms step_avg:127.05ms step:883/1393 train_time:110922ms step_avg:127.06ms step:884/1393 train_time:111053ms step_avg:127.06ms step:885/1393 train_time:111186ms step_avg:127.07ms step:886/1393 train_time:111319ms step_avg:127.08ms step:887/1393 train_time:111449ms step_avg:127.08ms step:888/1393 train_time:111581ms step_avg:127.09ms step:889/1393 train_time:111712ms step_avg:127.09ms step:890/1393 train_time:111842ms step_avg:127.09ms step:891/1393 train_time:111974ms step_avg:127.10ms step:892/1393 train_time:112105ms step_avg:127.10ms step:893/1393 train_time:112236ms step_avg:127.11ms step:894/1393 train_time:112367ms step_avg:127.11ms step:895/1393 train_time:112500ms step_avg:127.12ms step:896/1393 train_time:112631ms step_avg:127.12ms step:897/1393 train_time:112762ms step_avg:127.13ms step:898/1393 train_time:112893ms step_avg:127.13ms step:899/1393 train_time:113026ms step_avg:127.14ms step:900/1393 train_time:113157ms step_avg:127.14ms step:901/1393 train_time:113288ms step_avg:127.15ms step:902/1393 train_time:113420ms step_avg:127.15ms step:903/1393 train_time:113551ms step_avg:127.16ms step:904/1393 train_time:113684ms step_avg:127.16ms step:905/1393 train_time:113815ms step_avg:127.17ms step:906/1393 train_time:113946ms step_avg:127.17ms step:907/1393 train_time:114080ms step_avg:127.18ms step:908/1393 train_time:114210ms step_avg:127.18ms step:909/1393 train_time:114342ms step_avg:127.19ms step:910/1393 train_time:114473ms step_avg:127.19ms step:911/1393 train_time:114605ms step_avg:127.20ms step:912/1393 train_time:114736ms step_avg:127.20ms step:913/1393 train_time:114868ms step_avg:127.21ms step:914/1393 train_time:114999ms step_avg:127.21ms step:915/1393 train_time:115131ms step_avg:127.22ms step:916/1393 train_time:115264ms step_avg:127.22ms step:917/1393 train_time:115396ms step_avg:127.23ms step:918/1393 train_time:115528ms step_avg:127.23ms step:919/1393 train_time:115661ms step_avg:127.24ms step:920/1393 
train_time:115793ms step_avg:127.25ms step:921/1393 train_time:115923ms step_avg:127.25ms step:922/1393 train_time:116055ms step_avg:127.25ms step:923/1393 train_time:116186ms step_avg:127.26ms step:924/1393 train_time:116316ms step_avg:127.26ms step:925/1393 train_time:116448ms step_avg:127.27ms step:926/1393 train_time:116580ms step_avg:127.27ms step:927/1393 train_time:116712ms step_avg:127.28ms step:928/1393 train_time:116842ms step_avg:127.28ms step:929/1393 train_time:116974ms step_avg:127.28ms step:930/1393 train_time:117105ms step_avg:127.29ms step:931/1393 train_time:117239ms step_avg:127.29ms step:932/1393 train_time:117370ms step_avg:127.30ms step:933/1393 train_time:117504ms step_avg:127.31ms step:934/1393 train_time:117639ms step_avg:127.31ms step:935/1393 train_time:117772ms step_avg:127.32ms step:936/1393 train_time:117905ms step_avg:127.33ms step:937/1393 train_time:118039ms step_avg:127.33ms step:938/1393 train_time:118172ms step_avg:127.34ms step:939/1393 train_time:118304ms step_avg:127.35ms step:940/1393 train_time:118438ms step_avg:127.35ms step:941/1393 train_time:118570ms step_avg:127.36ms step:942/1393 train_time:118704ms step_avg:127.36ms step:943/1393 train_time:118839ms step_avg:127.37ms step:944/1393 train_time:118973ms step_avg:127.38ms step:945/1393 train_time:119107ms step_avg:127.39ms step:946/1393 train_time:119239ms step_avg:127.39ms step:947/1393 train_time:119372ms step_avg:127.40ms step:948/1393 train_time:119505ms step_avg:127.40ms step:949/1393 train_time:119638ms step_avg:127.41ms step:950/1393 train_time:119772ms step_avg:127.42ms step:951/1393 train_time:119907ms step_avg:127.43ms step:952/1393 train_time:120040ms step_avg:127.43ms step:953/1393 train_time:120173ms step_avg:127.44ms step:954/1393 train_time:120306ms step_avg:127.44ms step:955/1393 train_time:120439ms step_avg:127.45ms step:956/1393 train_time:120574ms step_avg:127.46ms step:957/1393 train_time:120705ms step_avg:127.46ms step:958/1393 train_time:120839ms 
step_avg:127.47ms step:959/1393 train_time:120972ms step_avg:127.47ms step:960/1393 train_time:121104ms step_avg:127.48ms step:961/1393 train_time:121237ms step_avg:127.48ms step:962/1393 train_time:121370ms step_avg:127.49ms step:963/1393 train_time:121505ms step_avg:127.50ms step:964/1393 train_time:121637ms step_avg:127.50ms step:965/1393 train_time:121768ms step_avg:127.51ms step:966/1393 train_time:121901ms step_avg:127.51ms step:967/1393 train_time:122036ms step_avg:127.52ms step:968/1393 train_time:122169ms step_avg:127.53ms step:969/1393 train_time:122303ms step_avg:127.53ms step:970/1393 train_time:122436ms step_avg:127.54ms step:971/1393 train_time:122569ms step_avg:127.54ms step:972/1393 train_time:122702ms step_avg:127.55ms step:973/1393 train_time:122835ms step_avg:127.55ms step:974/1393 train_time:122968ms step_avg:127.56ms step:975/1393 train_time:123101ms step_avg:127.57ms step:976/1393 train_time:123234ms step_avg:127.57ms step:977/1393 train_time:123367ms step_avg:127.58ms step:978/1393 train_time:123500ms step_avg:127.58ms step:979/1393 train_time:123634ms step_avg:127.59ms step:980/1393 train_time:123766ms step_avg:127.59ms step:981/1393 train_time:123899ms step_avg:127.60ms step:982/1393 train_time:124032ms step_avg:127.61ms step:983/1393 train_time:124165ms step_avg:127.61ms step:984/1393 train_time:124299ms step_avg:127.62ms step:985/1393 train_time:124432ms step_avg:127.62ms step:986/1393 train_time:124565ms step_avg:127.63ms step:987/1393 train_time:124699ms step_avg:127.63ms step:988/1393 train_time:124830ms step_avg:127.64ms step:989/1393 train_time:124963ms step_avg:127.64ms step:990/1393 train_time:125099ms step_avg:127.65ms step:991/1393 train_time:125230ms step_avg:127.66ms step:992/1393 train_time:125365ms step_avg:127.66ms step:993/1393 train_time:125502ms step_avg:127.67ms step:994/1393 train_time:125633ms step_avg:127.68ms step:995/1393 train_time:125765ms step_avg:127.68ms step:996/1393 train_time:125897ms step_avg:127.69ms 
step:997/1393 train_time:126030ms step_avg:127.69ms step:998/1393 train_time:126162ms step_avg:127.69ms step:999/1393 train_time:126295ms step_avg:127.70ms step:1000/1393 train_time:126428ms step_avg:127.71ms step:1000/1393 val_loss:3.4151 train_time:126559ms step_avg:127.84ms step:1001/1393 train_time:126577ms step_avg:127.73ms step:1002/1393 train_time:126702ms step_avg:127.72ms step:1003/1393 train_time:126839ms step_avg:127.73ms step:1004/1393 train_time:126972ms step_avg:127.74ms step:1005/1393 train_time:127105ms step_avg:127.74ms step:1006/1393 train_time:127237ms step_avg:127.75ms step:1007/1393 train_time:127369ms step_avg:127.75ms step:1008/1393 train_time:127502ms step_avg:127.76ms step:1009/1393 train_time:127636ms step_avg:127.76ms step:1010/1393 train_time:127769ms step_avg:127.77ms step:1011/1393 train_time:127905ms step_avg:127.78ms step:1012/1393 train_time:128038ms step_avg:127.78ms step:1013/1393 train_time:128171ms step_avg:127.79ms step:1014/1393 train_time:128304ms step_avg:127.79ms step:1015/1393 train_time:128437ms step_avg:127.80ms step:1016/1393 train_time:128569ms step_avg:127.80ms step:1017/1393 train_time:128703ms step_avg:127.81ms step:1018/1393 train_time:128837ms step_avg:127.81ms step:1019/1393 train_time:128971ms step_avg:127.82ms step:1020/1393 train_time:129103ms step_avg:127.83ms step:1021/1393 train_time:129237ms step_avg:127.83ms step:1022/1393 train_time:129368ms step_avg:127.83ms step:1023/1393 train_time:129502ms step_avg:127.84ms step:1024/1393 train_time:129636ms step_avg:127.85ms step:1025/1393 train_time:129769ms step_avg:127.85ms step:1026/1393 train_time:129902ms step_avg:127.86ms step:1027/1393 train_time:130034ms step_avg:127.86ms step:1028/1393 train_time:130169ms step_avg:127.87ms step:1029/1393 train_time:130303ms step_avg:127.87ms step:1030/1393 train_time:130436ms step_avg:127.88ms step:1031/1393 train_time:130568ms step_avg:127.88ms step:1032/1393 train_time:130700ms step_avg:127.89ms step:1033/1393 
train_time:130834ms step_avg:127.89ms step:1034/1393 train_time:130969ms step_avg:127.90ms step:1035/1393 train_time:131103ms step_avg:127.91ms step:1036/1393 train_time:131236ms step_avg:127.91ms step:1037/1393 train_time:131370ms step_avg:127.92ms step:1038/1393 train_time:131503ms step_avg:127.92ms step:1039/1393 train_time:131635ms step_avg:127.92ms step:1040/1393 train_time:131767ms step_avg:127.93ms step:1041/1393 train_time:131902ms step_avg:127.94ms step:1042/1393 train_time:132036ms step_avg:127.94ms step:1043/1393 train_time:132168ms step_avg:127.95ms step:1044/1393 train_time:132304ms step_avg:127.95ms step:1045/1393 train_time:132439ms step_avg:127.96ms step:1046/1393 train_time:132571ms step_avg:127.96ms step:1047/1393 train_time:132704ms step_avg:127.97ms step:1048/1393 train_time:132838ms step_avg:127.97ms step:1049/1393 train_time:132972ms step_avg:127.98ms step:1050/1393 train_time:133104ms step_avg:127.98ms step:1051/1393 train_time:133238ms step_avg:127.99ms step:1052/1393 train_time:133371ms step_avg:128.00ms step:1053/1393 train_time:133504ms step_avg:128.00ms step:1054/1393 train_time:133636ms step_avg:128.00ms step:1055/1393 train_time:133768ms step_avg:128.01ms step:1056/1393 train_time:133901ms step_avg:128.01ms step:1057/1393 train_time:134034ms step_avg:128.02ms step:1058/1393 train_time:134168ms step_avg:128.02ms step:1059/1393 train_time:134302ms step_avg:128.03ms step:1060/1393 train_time:134435ms step_avg:128.03ms step:1061/1393 train_time:134567ms step_avg:128.04ms step:1062/1393 train_time:134702ms step_avg:128.04ms step:1063/1393 train_time:134835ms step_avg:128.05ms step:1064/1393 train_time:134967ms step_avg:128.05ms step:1065/1393 train_time:135101ms step_avg:128.06ms step:1066/1393 train_time:135234ms step_avg:128.06ms step:1067/1393 train_time:135368ms step_avg:128.07ms step:1068/1393 train_time:135501ms step_avg:128.07ms step:1069/1393 train_time:135634ms step_avg:128.08ms step:1070/1393 train_time:135767ms step_avg:128.08ms 
step:1071/1393 train_time:135903ms step_avg:128.09ms step:1072/1393 train_time:136034ms step_avg:128.09ms step:1073/1393 train_time:136168ms step_avg:128.10ms step:1074/1393 train_time:136301ms step_avg:128.10ms step:1075/1393 train_time:136434ms step_avg:128.11ms step:1076/1393 train_time:136568ms step_avg:128.11ms step:1077/1393 train_time:136702ms step_avg:128.12ms step:1078/1393 train_time:136835ms step_avg:128.12ms step:1079/1393 train_time:136970ms step_avg:128.13ms step:1080/1393 train_time:137104ms step_avg:128.13ms step:1081/1393 train_time:137236ms step_avg:128.14ms step:1082/1393 train_time:137368ms step_avg:128.14ms step:1083/1393 train_time:137501ms step_avg:128.15ms step:1084/1393 train_time:137635ms step_avg:128.15ms step:1085/1393 train_time:137768ms step_avg:128.16ms step:1086/1393 train_time:137902ms step_avg:128.16ms step:1087/1393 train_time:138036ms step_avg:128.17ms step:1088/1393 train_time:138168ms step_avg:128.17ms step:1089/1393 train_time:138303ms step_avg:128.18ms step:1090/1393 train_time:138437ms step_avg:128.18ms step:1091/1393 train_time:138570ms step_avg:128.19ms step:1092/1393 train_time:138703ms step_avg:128.19ms step:1093/1393 train_time:138837ms step_avg:128.20ms step:1094/1393 train_time:138970ms step_avg:128.20ms step:1095/1393 train_time:139103ms step_avg:128.21ms step:1096/1393 train_time:139238ms step_avg:128.21ms step:1097/1393 train_time:139371ms step_avg:128.22ms step:1098/1393 train_time:139505ms step_avg:128.22ms step:1099/1393 train_time:139639ms step_avg:128.23ms step:1100/1393 train_time:139771ms step_avg:128.23ms step:1101/1393 train_time:139906ms step_avg:128.24ms step:1102/1393 train_time:140039ms step_avg:128.24ms step:1103/1393 train_time:140172ms step_avg:128.24ms step:1104/1393 train_time:140305ms step_avg:128.25ms step:1105/1393 train_time:140440ms step_avg:128.26ms step:1106/1393 train_time:140574ms step_avg:128.26ms step:1107/1393 train_time:140707ms step_avg:128.27ms step:1108/1393 train_time:140844ms 
step_avg:128.27ms step:1109/1393 train_time:140977ms step_avg:128.28ms step:1110/1393 train_time:141110ms step_avg:128.28ms step:1111/1393 train_time:141244ms step_avg:128.29ms step:1112/1393 train_time:141378ms step_avg:128.29ms step:1113/1393 train_time:141510ms step_avg:128.30ms step:1114/1393 train_time:141643ms step_avg:128.30ms step:1115/1393 train_time:141777ms step_avg:128.31ms step:1116/1393 train_time:141910ms step_avg:128.31ms step:1117/1393 train_time:142043ms step_avg:128.31ms step:1118/1393 train_time:142179ms step_avg:128.32ms step:1119/1393 train_time:142312ms step_avg:128.33ms step:1120/1393 train_time:142445ms step_avg:128.33ms step:1121/1393 train_time:142578ms step_avg:128.33ms step:1122/1393 train_time:142711ms step_avg:128.34ms step:1123/1393 train_time:142844ms step_avg:128.34ms step:1124/1393 train_time:142977ms step_avg:128.35ms step:1125/1393 train_time:143110ms step_avg:128.35ms step:1125/1393 val_loss:3.3637 train_time:143242ms step_avg:128.47ms step:1126/1393 train_time:143260ms step_avg:128.37ms step:1127/1393 train_time:143387ms step_avg:128.37ms step:1128/1393 train_time:143520ms step_avg:128.37ms step:1129/1393 train_time:143654ms step_avg:128.38ms step:1130/1393 train_time:143786ms step_avg:128.38ms step:1131/1393 train_time:143918ms step_avg:128.38ms step:1132/1393 train_time:144050ms step_avg:128.39ms step:1133/1393 train_time:144181ms step_avg:128.39ms step:1134/1393 train_time:144316ms step_avg:128.40ms step:1135/1393 train_time:144451ms step_avg:128.40ms step:1136/1393 train_time:144588ms step_avg:128.41ms step:1137/1393 train_time:144721ms step_avg:128.41ms step:1138/1393 train_time:144858ms step_avg:128.42ms step:1139/1393 train_time:144992ms step_avg:128.42ms step:1140/1393 train_time:145127ms step_avg:128.43ms step:1141/1393 train_time:145262ms step_avg:128.44ms step:1142/1393 train_time:145396ms step_avg:128.44ms step:1143/1393 train_time:145532ms step_avg:128.45ms step:1144/1393 train_time:145666ms step_avg:128.45ms 
step:1145/1393 train_time:145802ms step_avg:128.46ms step:1146/1393 train_time:145936ms step_avg:128.46ms step:1147/1393 train_time:146069ms step_avg:128.47ms step:1148/1393 train_time:146205ms step_avg:128.48ms step:1149/1393 train_time:146338ms step_avg:128.48ms step:1150/1393 train_time:146472ms step_avg:128.48ms step:1151/1393 train_time:146609ms step_avg:128.49ms step:1152/1393 train_time:146744ms step_avg:128.50ms step:1153/1393 train_time:146880ms step_avg:128.50ms step:1154/1393 train_time:147014ms step_avg:128.51ms step:1155/1393 train_time:147149ms step_avg:128.51ms step:1156/1393 train_time:147288ms step_avg:128.52ms step:1157/1393 train_time:147423ms step_avg:128.53ms step:1158/1393 train_time:147558ms step_avg:128.53ms step:1159/1393 train_time:147691ms step_avg:128.54ms step:1160/1393 train_time:147826ms step_avg:128.54ms step:1161/1393 train_time:147960ms step_avg:128.55ms step:1162/1393 train_time:148095ms step_avg:128.55ms step:1163/1393 train_time:148229ms step_avg:128.56ms step:1164/1393 train_time:148364ms step_avg:128.57ms step:1165/1393 train_time:148498ms step_avg:128.57ms step:1166/1393 train_time:148633ms step_avg:128.57ms step:1167/1393 train_time:148767ms step_avg:128.58ms step:1168/1393 train_time:148903ms step_avg:128.59ms step:1169/1393 train_time:149038ms step_avg:128.59ms step:1170/1393 train_time:149171ms step_avg:128.60ms step:1171/1393 train_time:149305ms step_avg:128.60ms step:1172/1393 train_time:149440ms step_avg:128.61ms step:1173/1393 train_time:149574ms step_avg:128.61ms step:1174/1393 train_time:149714ms step_avg:128.62ms step:1175/1393 train_time:149850ms step_avg:128.63ms step:1176/1393 train_time:149985ms step_avg:128.63ms step:1177/1393 train_time:150124ms step_avg:128.64ms step:1178/1393 train_time:150258ms step_avg:128.65ms step:1179/1393 train_time:150392ms step_avg:128.65ms step:1180/1393 train_time:150530ms step_avg:128.66ms step:1181/1393 train_time:150666ms step_avg:128.66ms step:1182/1393 train_time:150801ms 
step_avg:128.67ms step:1183/1393 train_time:150935ms step_avg:128.67ms step:1184/1393 train_time:151070ms step_avg:128.68ms step:1185/1393 train_time:151205ms step_avg:128.69ms step:1186/1393 train_time:151342ms step_avg:128.69ms step:1187/1393 train_time:151482ms step_avg:128.70ms step:1188/1393 train_time:151616ms step_avg:128.71ms step:1189/1393 train_time:151750ms step_avg:128.71ms step:1190/1393 train_time:151883ms step_avg:128.71ms step:1191/1393 train_time:152018ms step_avg:128.72ms step:1192/1393 train_time:152154ms step_avg:128.73ms step:1193/1393 train_time:152288ms step_avg:128.73ms step:1194/1393 train_time:152424ms step_avg:128.74ms step:1195/1393 train_time:152558ms step_avg:128.74ms step:1196/1393 train_time:152694ms step_avg:128.75ms step:1197/1393 train_time:152829ms step_avg:128.75ms step:1198/1393 train_time:152966ms step_avg:128.76ms step:1199/1393 train_time:153100ms step_avg:128.76ms step:1200/1393 train_time:153234ms step_avg:128.77ms step:1201/1393 train_time:153367ms step_avg:128.77ms step:1202/1393 train_time:153507ms step_avg:128.78ms step:1203/1393 train_time:153644ms step_avg:128.79ms step:1204/1393 train_time:153777ms step_avg:128.79ms step:1205/1393 train_time:153913ms step_avg:128.80ms step:1206/1393 train_time:154050ms step_avg:128.80ms step:1207/1393 train_time:154183ms step_avg:128.81ms step:1208/1393 train_time:154318ms step_avg:128.81ms step:1209/1393 train_time:154452ms step_avg:128.82ms step:1210/1393 train_time:154590ms step_avg:128.82ms step:1211/1393 train_time:154726ms step_avg:128.83ms step:1212/1393 train_time:154862ms step_avg:128.84ms step:1213/1393 train_time:154997ms step_avg:128.84ms step:1214/1393 train_time:155130ms step_avg:128.85ms step:1215/1393 train_time:155267ms step_avg:128.85ms step:1216/1393 train_time:155400ms step_avg:128.86ms step:1217/1393 train_time:155537ms step_avg:128.86ms step:1218/1393 train_time:155670ms step_avg:128.87ms step:1219/1393 train_time:155804ms step_avg:128.87ms step:1220/1393 
train_time:155939ms step_avg:128.88ms step:1221/1393 train_time:156072ms step_avg:128.88ms step:1222/1393 train_time:156207ms step_avg:128.88ms step:1223/1393 train_time:156341ms step_avg:128.89ms step:1224/1393 train_time:156478ms step_avg:128.89ms step:1225/1393 train_time:156614ms step_avg:128.90ms step:1226/1393 train_time:156747ms step_avg:128.90ms step:1227/1393 train_time:156882ms step_avg:128.91ms step:1228/1393 train_time:157016ms step_avg:128.91ms step:1229/1393 train_time:157149ms step_avg:128.92ms step:1230/1393 train_time:157287ms step_avg:128.92ms step:1231/1393 train_time:157422ms step_avg:128.93ms step:1232/1393 train_time:157559ms step_avg:128.94ms step:1233/1393 train_time:157692ms step_avg:128.94ms step:1234/1393 train_time:157827ms step_avg:128.94ms step:1235/1393 train_time:157964ms step_avg:128.95ms step:1236/1393 train_time:158100ms step_avg:128.96ms step:1237/1393 train_time:158235ms step_avg:128.96ms step:1238/1393 train_time:158374ms step_avg:128.97ms step:1239/1393 train_time:158506ms step_avg:128.97ms step:1240/1393 train_time:158643ms step_avg:128.98ms step:1241/1393 train_time:158780ms step_avg:128.98ms step:1242/1393 train_time:158913ms step_avg:128.99ms step:1243/1393 train_time:159048ms step_avg:128.99ms step:1244/1393 train_time:159184ms step_avg:129.00ms step:1245/1393 train_time:159320ms step_avg:129.00ms step:1246/1393 train_time:159454ms step_avg:129.01ms step:1247/1393 train_time:159589ms step_avg:129.01ms step:1248/1393 train_time:159724ms step_avg:129.02ms step:1249/1393 train_time:159857ms step_avg:129.02ms step:1250/1393 train_time:159991ms step_avg:129.03ms step:1250/1393 val_loss:3.3164 train_time:160125ms step_avg:129.13ms step:1251/1393 train_time:160143ms step_avg:129.04ms step:1252/1393 train_time:160270ms step_avg:129.04ms step:1253/1393 train_time:160404ms step_avg:129.05ms step:1254/1393 train_time:160537ms step_avg:129.05ms step:1255/1393 train_time:160677ms step_avg:129.06ms step:1256/1393 train_time:160811ms 
step_avg:129.06ms step:1257/1393 train_time:160945ms step_avg:129.07ms step:1258/1393 train_time:161078ms step_avg:129.07ms step:1259/1393 train_time:161216ms step_avg:129.08ms step:1260/1393 train_time:161352ms step_avg:129.08ms step:1261/1393 train_time:161485ms step_avg:129.08ms step:1262/1393 train_time:161622ms step_avg:129.09ms step:1263/1393 train_time:161757ms step_avg:129.10ms step:1264/1393 train_time:161889ms step_avg:129.10ms step:1265/1393 train_time:162023ms step_avg:129.10ms step:1266/1393 train_time:162158ms step_avg:129.11ms step:1267/1393 train_time:162294ms step_avg:129.11ms step:1268/1393 train_time:162429ms step_avg:129.12ms step:1269/1393 train_time:162563ms step_avg:129.12ms step:1270/1393 train_time:162698ms step_avg:129.13ms step:1271/1393 train_time:162833ms step_avg:129.13ms step:1272/1393 train_time:162967ms step_avg:129.13ms step:1273/1393 train_time:163101ms step_avg:129.14ms step:1274/1393 train_time:163235ms step_avg:129.14ms step:1275/1393 train_time:163371ms step_avg:129.15ms step:1276/1393 train_time:163505ms step_avg:129.15ms step:1277/1393 train_time:163640ms step_avg:129.16ms step:1278/1393 train_time:163775ms step_avg:129.16ms step:1279/1393 train_time:163909ms step_avg:129.16ms step:1280/1393 train_time:164045ms step_avg:129.17ms step:1281/1393 train_time:164179ms step_avg:129.17ms step:1282/1393 train_time:164314ms step_avg:129.18ms step:1283/1393 train_time:164449ms step_avg:129.18ms step:1284/1393 train_time:164585ms step_avg:129.19ms step:1285/1393 train_time:164721ms step_avg:129.19ms step:1286/1393 train_time:164857ms step_avg:129.20ms step:1287/1393 train_time:164993ms step_avg:129.20ms step:1288/1393 train_time:165126ms step_avg:129.21ms step:1289/1393 train_time:165264ms step_avg:129.21ms step:1290/1393 train_time:165401ms step_avg:129.22ms step:1291/1393 train_time:165538ms step_avg:129.23ms step:1292/1393 train_time:165673ms step_avg:129.23ms step:1293/1393 train_time:165811ms step_avg:129.24ms step:1294/1393 
train_time:165945ms step_avg:129.24ms step:1295/1393 train_time:166078ms step_avg:129.24ms step:1296/1393 train_time:166215ms step_avg:129.25ms step:1297/1393 train_time:166352ms step_avg:129.26ms step:1298/1393 train_time:166484ms step_avg:129.26ms step:1299/1393 train_time:166620ms step_avg:129.26ms step:1300/1393 train_time:166754ms step_avg:129.27ms step:1301/1393 train_time:166890ms step_avg:129.27ms step:1302/1393 train_time:167022ms step_avg:129.27ms step:1303/1393 train_time:167158ms step_avg:129.28ms step:1304/1393 train_time:167295ms step_avg:129.28ms step:1305/1393 train_time:167429ms step_avg:129.29ms step:1306/1393 train_time:167563ms step_avg:129.29ms step:1307/1393 train_time:167699ms step_avg:129.30ms step:1308/1393 train_time:167835ms step_avg:129.30ms step:1309/1393 train_time:167971ms step_avg:129.31ms step:1310/1393 train_time:168105ms step_avg:129.31ms step:1311/1393 train_time:168238ms step_avg:129.31ms step:1312/1393 train_time:168374ms step_avg:129.32ms step:1313/1393 train_time:168510ms step_avg:129.32ms step:1314/1393 train_time:168645ms step_avg:129.33ms step:1315/1393 train_time:168780ms step_avg:129.33ms step:1316/1393 train_time:168914ms step_avg:129.34ms step:1317/1393 train_time:169049ms step_avg:129.34ms step:1318/1393 train_time:169183ms step_avg:129.34ms step:1319/1393 train_time:169318ms step_avg:129.35ms step:1320/1393 train_time:169452ms step_avg:129.35ms step:1321/1393 train_time:169588ms step_avg:129.36ms step:1322/1393 train_time:169726ms step_avg:129.36ms step:1323/1393 train_time:169860ms step_avg:129.37ms step:1324/1393 train_time:169995ms step_avg:129.37ms step:1325/1393 train_time:170130ms step_avg:129.38ms step:1326/1393 train_time:170265ms step_avg:129.38ms step:1327/1393 train_time:170400ms step_avg:129.38ms step:1328/1393 train_time:170534ms step_avg:129.39ms step:1329/1393 train_time:170674ms step_avg:129.40ms step:1330/1393 train_time:170808ms step_avg:129.40ms step:1331/1393 train_time:170945ms step_avg:129.41ms 
step:1332/1393 train_time:171082ms step_avg:129.41ms step:1333/1393 train_time:171219ms step_avg:129.42ms step:1334/1393 train_time:171353ms step_avg:129.42ms step:1335/1393 train_time:171485ms step_avg:129.42ms step:1336/1393 train_time:171622ms step_avg:129.43ms step:1337/1393 train_time:171758ms step_avg:129.43ms step:1338/1393 train_time:171893ms step_avg:129.44ms step:1339/1393 train_time:172028ms step_avg:129.44ms step:1340/1393 train_time:172165ms step_avg:129.45ms step:1341/1393 train_time:172299ms step_avg:129.45ms step:1342/1393 train_time:172433ms step_avg:129.45ms step:1343/1393 train_time:172567ms step_avg:129.46ms step:1344/1393 train_time:172701ms step_avg:129.46ms step:1345/1393 train_time:172838ms step_avg:129.47ms step:1346/1393 train_time:172973ms step_avg:129.47ms step:1347/1393 train_time:173111ms step_avg:129.48ms step:1348/1393 train_time:173246ms step_avg:129.48ms step:1349/1393 train_time:173381ms step_avg:129.49ms step:1350/1393 train_time:173517ms step_avg:129.49ms step:1351/1393 train_time:173652ms step_avg:129.49ms step:1352/1393 train_time:173793ms step_avg:129.50ms step:1353/1393 train_time:173929ms step_avg:129.51ms step:1354/1393 train_time:174066ms step_avg:129.51ms step:1355/1393 train_time:174201ms step_avg:129.52ms step:1356/1393 train_time:174336ms step_avg:129.52ms step:1357/1393 train_time:174472ms step_avg:129.53ms step:1358/1393 train_time:174608ms step_avg:129.53ms step:1359/1393 train_time:174743ms step_avg:129.54ms step:1360/1393 train_time:174882ms step_avg:129.54ms step:1361/1393 train_time:175019ms step_avg:129.55ms step:1362/1393 train_time:175157ms step_avg:129.55ms step:1363/1393 train_time:175295ms step_avg:129.56ms step:1364/1393 train_time:175432ms step_avg:129.57ms step:1365/1393 train_time:175566ms step_avg:129.57ms step:1366/1393 train_time:175700ms step_avg:129.57ms step:1367/1393 train_time:175838ms step_avg:129.58ms step:1368/1393 train_time:175976ms step_avg:129.58ms step:1369/1393 train_time:176115ms 
step_avg:129.59ms step:1370/1393 train_time:176254ms step_avg:129.60ms step:1371/1393 train_time:176390ms step_avg:129.60ms step:1372/1393 train_time:176527ms step_avg:129.61ms step:1373/1393 train_time:176661ms step_avg:129.61ms step:1374/1393 train_time:176800ms step_avg:129.62ms step:1375/1393 train_time:176936ms step_avg:129.62ms step:1375/1393 val_loss:3.2820 train_time:177070ms step_avg:129.72ms step:1376/1393 train_time:177088ms step_avg:129.64ms step:1377/1393 train_time:177215ms step_avg:129.64ms step:1378/1393 train_time:177350ms step_avg:129.64ms step:1379/1393 train_time:177486ms step_avg:129.65ms step:1380/1393 train_time:177622ms step_avg:129.65ms step:1381/1393 train_time:177758ms step_avg:129.66ms step:1382/1393 train_time:177894ms step_avg:129.66ms step:1383/1393 train_time:178029ms step_avg:129.66ms step:1384/1393 train_time:178167ms step_avg:129.67ms step:1385/1393 train_time:178303ms step_avg:129.67ms step:1386/1393 train_time:178438ms step_avg:129.68ms step:1387/1393 train_time:178577ms step_avg:129.69ms step:1388/1393 train_time:178713ms step_avg:129.69ms step:1389/1393 train_time:178850ms step_avg:129.70ms step:1390/1393 train_time:178985ms step_avg:129.70ms step:1391/1393 train_time:179122ms step_avg:129.70ms step:1392/1393 train_time:179258ms step_avg:129.71ms step:1393/1393 train_time:179392ms step_avg:129.71ms step:1393/1393 val_loss:3.2785 train_time:179527ms step_avg:129.81ms peak memory allocated: 37653 MiB reserved: 39236 MiB